carlk3
Posts: 189
Joined: Wed Feb 17, 2021 8:46 pm

Stack Location

Sun Dec 10, 2023 2:39 am

[Splitting this off from "Troubles with Lua":]
carlk3 wrote:
Wed Dec 06, 2023 10:57 pm
DarkElvenAngel wrote:
Mon Sep 25, 2023 1:19 am
This is a helpful article, I see that if core 0 is going over the stack limit it could destroy the stack on core 1. I'm going to have to see what I can do about this...
...
Has anyone worked up a custom linker script to place the core0 stack at the bottom of memory that they would be willing to share?
Here is my whack at it:

Code: Select all

/* Based on GCC ARM embedded samples.
   Defines the following symbols for use by code:
    __exidx_start
    __exidx_end
    __etext
    __data_start__
    __preinit_array_start
    __preinit_array_end
    __init_array_start
    __init_array_end
    __fini_array_start
    __fini_array_end
    __data_end__
    __bss_start__
    __bss_end__
    __end__
    end
    __HeapLimit
    __StackLimit
    __StackTop
    __StackBottom
    __StackOneTop
    __StackOneBottom
    __stack (== StackTop)
*/

/* Memory regions available to the linker.
 *
 * In this layout the two 4k scratch regions are carved out of the very start
 * of the 256k RAM window and the general-purpose RAM region follows them, so
 * anything placed in SCRATCH_X / SCRATCH_Y sits below all other RAM contents.
 *
 * NOTE(review): the stock memmap_default.ld puts SCRATCH_X / SCRATCH_Y at
 * 0x20040000 / 0x20041000, above the 256k window; with the regions below,
 * nothing is placed at those addresses any more - confirm that is intended.
 */
MEMORY
{
    FLASH(rx) : ORIGIN = 0x10000000, LENGTH = 2048k
    /* Physical RAM is ORIGIN =  0x20000000, LENGTH = 0x40000 (256k) */
    SCRATCH_X(rwx) : ORIGIN = 0x20000000, LENGTH = 0x1000 /* 0x20000000 - 0x20000FFF */
    SCRATCH_Y(rwx) : ORIGIN = 0x20001000, LENGTH = 0x1000 /* 0x20001000 - 0x20001FFF */
    RAM(rwx) : ORIGIN =  0x20002000, LENGTH = 0x3E000 /* 0x40000 - 0x2000 */
    /* Last byte of physical RAM is at 0x2003FFFF */
}

/* ELF entry point symbol; see the note ahead of .text for how it is used. */
ENTRY(_entry_point)

/* Output section layout: flash-resident sections first, then the sections
 * that live in RAM / scratch, then the heap and the stack placeholders.
 * The order of the statements below is significant. */
SECTIONS
{
    /* Second stage bootloader is prepended to the image. It must be 256 bytes big
       and checksummed. It is usually built by the boot_stage2 target
       in the Raspberry Pi Pico SDK
    */

    /* Empty marker section: records the flash address where the image begins. */
    .flash_begin : {
        __flash_binary_start = .;
    } > FLASH

    /* __boot2_start__ / __boot2_end__ bracket the second stage so that its
     * size can be checked by the ASSERT that follows. */
    .boot2 : {
        __boot2_start__ = .;
        KEEP (*(.boot2))
        __boot2_end__ = .;
    } > FLASH

    ASSERT(__boot2_end__ - __boot2_start__ == 256,
        "ERROR: Pico second stage bootloader must be 256 bytes in size")

    /* The second stage will always enter the image at the start of .text.
       The debugger will use the ELF entry point, which is the _entry_point
       symbol if present, otherwise defaults to start of .text.
       This can be used to transfer control back to the bootrom on debugger
       launches only, to perform proper flash setup.
    */

    .text : {
        __logical_binary_start = .;
        KEEP (*(.vectors))
        KEEP (*(.binary_info_header))
        /* End of the binary info header; checked against the 256-byte limit
         * by the ASSERT at the end of this script. */
        __binary_info_header_end = .;
        KEEP (*(.reset))
        /* TODO revisit this now memset/memcpy/float in ROM */
        /* bit of a hack right now to exclude all floating point and time critical (e.g. memset, memcpy) code from
         * FLASH ... we will include any thing excluded here in .data below by default */
        *(.init)
        *(EXCLUDE_FILE(*libgcc.a: *libc.a:*lib_a-mem*.o *libm.a:) .text*)
        *(.fini)
        /* Pull all c'tors into .text */
        *crtbegin.o(.ctors)
        *crtbegin?.o(.ctors)
        *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors)
        *(SORT(.ctors.*))
        *(.ctors)
        /* Followed by destructors */
        *crtbegin.o(.dtors)
        *crtbegin?.o(.dtors)
        *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors)
        *(SORT(.dtors.*))
        *(.dtors)

        *(.eh_frame*)
        . = ALIGN(4);
    } > FLASH

    /* Read-only data kept in flash. The same library objects that are excluded
     * from .text above are excluded here as well; .flashdata* input sections
     * are appended after the ordinary .rodata*. */
    .rodata : {
        *(EXCLUDE_FILE(*libgcc.a: *libc.a:*lib_a-mem*.o *libm.a:) .rodata*)
        . = ALIGN(4);
        *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.flashdata*)))
        . = ALIGN(4);
    } > FLASH

    /* ARM exception table / exception index sections, kept in flash.
     * __exidx_start / __exidx_end bracket the index section. */
    .ARM.extab :
    {
        *(.ARM.extab* .gnu.linkonce.armextab.*)
    } > FLASH

    __exidx_start = .;
    .ARM.exidx :
    {
        *(.ARM.exidx* .gnu.linkonce.armexidx.*)
    } > FLASH
    __exidx_end = .;

    /* Machine inspectable binary information */
    . = ALIGN(4);
    __binary_info_start = .;
    .binary_info :
    {
        KEEP(*(.binary_info.keep.*))
        *(.binary_info.*)
    } > FLASH
    __binary_info_end = .;
    . = ALIGN(4);

   /* Space in RAM for .ram_vector_table input sections. NOLOAD: no contents
    * are loaded for this section. */
   .ram_vector_table (NOLOAD): {
        *(.ram_vector_table)
    } > RAM

    /* Initialised data, plus the code and read-only data that the flash
     * sections above deliberately left out. Run-time addresses are assigned
     * in RAM while the load image is placed in FLASH (`> RAM AT> FLASH`). */
    .data : {
        __data_start__ = .;
        *(vtable)

        *(.time_critical*)

        /* remaining .text and .rodata; i.e. stuff we exclude above because we want it in RAM */
        *(.text*)
        . = ALIGN(4);
        *(.rodata*)
        . = ALIGN(4);

        *(.data*)

        . = ALIGN(4);
        *(.after_data.*)
        . = ALIGN(4);
        /* mutex array */
        PROVIDE_HIDDEN (__mutex_array_start = .);
        KEEP(*(SORT(.mutex_array.*)))
        KEEP(*(.mutex_array))
        PROVIDE_HIDDEN (__mutex_array_end = .);

        . = ALIGN(4);
        /* preinit data */
        PROVIDE_HIDDEN (__preinit_array_start = .);
        KEEP(*(SORT(.preinit_array.*)))
        KEEP(*(.preinit_array))
        PROVIDE_HIDDEN (__preinit_array_end = .);

        . = ALIGN(4);
        /* init data */
        PROVIDE_HIDDEN (__init_array_start = .);
        KEEP(*(SORT(.init_array.*)))
        KEEP(*(.init_array))
        PROVIDE_HIDDEN (__init_array_end = .);

        . = ALIGN(4);
        /* fini data (note: unlike the arrays above, these are not wrapped in KEEP) */
        PROVIDE_HIDDEN (__fini_array_start = .);
        *(SORT(.fini_array.*))
        *(.fini_array)
        PROVIDE_HIDDEN (__fini_array_end = .);

        *(.jcr)
        . = ALIGN(4);
        /* All data end */
        __data_end__ = .;
    } > RAM AT> FLASH
    /* __etext is (for backwards compatibility) the name of the .data init source pointer (...) */
    __etext = LOADADDR(.data);

    /* Data that is given RAM addresses but has no load image (NOLOAD). */
    .uninitialized_data (NOLOAD): {
        . = ALIGN(4);
        *(.uninitialized_data*)
    } > RAM

    /* Contents explicitly placed in the scratch regions. As with .data, the
     * run-time address is in the scratch region and the load image is in
     * FLASH; __scratch_*_source__ is the load address of each section. */
    /* Start and end symbols must be word-aligned */
    .scratch_x : {
        __scratch_x_start__ = .;
        *(.scratch_x.*)
        . = ALIGN(4);
        __scratch_x_end__ = .;
    } > SCRATCH_X AT > FLASH
    __scratch_x_source__ = LOADADDR(.scratch_x);

    .scratch_y : {
        __scratch_y_start__ = .;
        *(.scratch_y.*)
        . = ALIGN(4);
        __scratch_y_end__ = .;
    } > SCRATCH_Y AT > FLASH
    __scratch_y_source__ = LOADADDR(.scratch_y);

    /* .bss* and COMMON symbols in RAM, bracketed by the word-aligned
     * __bss_start__ / __bss_end__ symbols. */
    .bss  : {
        . = ALIGN(4);
        __bss_start__ = .;
        *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*)))
        *(COMMON)
        . = ALIGN(4);
        __bss_end__ = .;
    } > RAM

    /* Heap: __end__ / end mark where it starts (directly after .bss in RAM);
     * __HeapLimit marks the end of the .heap* input sections. */
    .heap (NOLOAD):
    {
        __end__ = .;
        end = __end__;
        KEEP(*(.heap*))
        __HeapLimit = .;
    } > RAM

    /* The .stack*_dummy sections don't contain any symbols. They are only
     * used by the linker to calculate the size of the stack sections, and to
     * assign values to the stack symbols later.
     *
     * stack1 section may be empty/missing if platform_launch_core1 is not used
     *
     * Ordering matters: .stack1_dummy comes first so that `.stack1*` input
     * sections are claimed by it before the broader `.stack*` pattern in
     * .stack_dummy is considered.
     *
     * Compared with the stock script the two regions are swapped: `.stack1*`
     * goes to SCRATCH_Y and the remaining `.stack*` goes to SCRATCH_X. */

    .stack1_dummy (NOLOAD):
    {
        *(.stack1*)
    } > SCRATCH_Y /* Was SCRATCH_X */
    .stack_dummy (NOLOAD):
    {
        KEEP(*(.stack*))
    } > SCRATCH_X /* Was SCRATCH_Y */

    /* Marker for the end of everything placed in flash. */
    .flash_end : {
        PROVIDE(__flash_binary_end = .);
    } > FLASH

    /* stack limit is poorly named, but historically is maximum heap ptr */
    __StackLimit = ORIGIN(RAM) + LENGTH(RAM);
    /* Each stack top is the end address of its scratch region; each bottom is
     * that top minus the size of the matching placeholder section above. */
    __StackOneTop = ORIGIN(SCRATCH_Y) + LENGTH(SCRATCH_Y); /* Was SCRATCH_X */
    __StackTop = ORIGIN(SCRATCH_X) + LENGTH(SCRATCH_X); /* Was SCRATCH_Y */
    __StackOneBottom = __StackOneTop - SIZEOF(.stack1_dummy);
    __StackBottom = __StackTop - SIZEOF(.stack_dummy);
    PROVIDE(__stack = __StackTop);

    /* NOTE(review): the stock script additionally checks
     * __StackLimit >= __HeapLimit ("region RAM overflowed") at this point;
     * that check is not present here - confirm the omission is intended. */

    ASSERT( __binary_info_header_end - __logical_binary_start <= 256, "Binary info must be in first 256 bytes of the binary")
    /* todo assert on extra code */
}
I started with `memmap_default.ld` and swapped SCRATCH_X and SCRATCH_Y like "hippy", then moved SCRATCH_X and SCRATCH_Y to the beginning of RAM. Here are the diffs:

Code: Select all

diff memmap_default.ld memmap.ld
27,29c27,31
<     RAM(rwx) : ORIGIN =  0x20000000, LENGTH = 256k
<     SCRATCH_X(rwx) : ORIGIN = 0x20040000, LENGTH = 4k
<     SCRATCH_Y(rwx) : ORIGIN = 0x20041000, LENGTH = 4k
---
>     /* Physical RAM is ORIGIN =  0x20000000, LENGTH = 0x40000 (256k) */
>     SCRATCH_X(rwx) : ORIGIN = 0x20000000, LENGTH = 0x1000
>     SCRATCH_Y(rwx) : ORIGIN = 0x20001000, LENGTH = 0x1000
>     RAM(rwx) : ORIGIN =  0x20002000, LENGTH = 0x3E000 /* 0x40000 - 0x2000 */
>     /* Last byte of physical RAM is at 0x2003FFFF */
221,223d222
<     /* by default we put core 0 stack at the end of scratch Y, so that if core 1
<      * stack is not used then all of SCRATCH_X is free.
<      */
227c226
<     } > SCRATCH_X
---
>     } > SCRATCH_Y /* Was SCRATCH_X */
231c230
<     } > SCRATCH_Y
---
>     } > SCRATCH_X /* Was SCRATCH_Y */
239,240c238,239
<     __StackOneTop = ORIGIN(SCRATCH_X) + LENGTH(SCRATCH_X);
<     __StackTop = ORIGIN(SCRATCH_Y) + LENGTH(SCRATCH_Y);
---
>     __StackOneTop = ORIGIN(SCRATCH_Y) + LENGTH(SCRATCH_Y); /* Was SCRATCH_X */
>     __StackTop = ORIGIN(SCRATCH_X) + LENGTH(SCRATCH_X); /* Was SCRATCH_Y */
244,246d242
<
<     /* Check if data + heap + stack exceeds RAM limit */
<     ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed")
It seems to work. I do indeed get a Hard Fault if I try to put an excessively large array on the stack (which is what I was looking for, according to Miro Samek's advice).

I'm not sure at all about how these changes would interact with PICO_STACK_SIZE, PICO_HEAP_SIZE, and PICO_USE_STACK_GUARDS.

hippy
Posts: 15839
Joined: Fri Sep 09, 2011 10:34 pm
Location: UK

Re: Stack Location

Sun Dec 10, 2023 5:17 pm

The case for putting a downward growing stack at a low memory address, in the hopes of being more fail-safe than trampling over data in RAM, isn't entirely convincing IMO. It will however depend on the exact chip and architecture.

I guess decrementing SP to outside RAM should more quickly crash and burn than trampling over data RAM would, but by then the damage in the physical world may still already have been done.

The most obvious thing to do would be to hard-fault on any stack push outside the defined stack area. The trouble there is that it is not usually easy to do, only hard-fault on writes to memory may be available and it's not easy to tell if that's a stack push or an intended data write.

The solution is usually to hard-fault on a write to memory within the stack such that it hard-faults when the stack nears exhaustion rather than is exhausted. Nothing will be writing to that memory so, if it ever hard-faults, it's a stack push to a stack nearing exhaustion error. Can be considered a stack overflow.

That's what the RP2040 offers, what the Pico SDK supports, with what it calls 'stack guards'. So it doesn't seem to make any difference where the stack is, bottom, top, or even in the middle if one wants.

That also gets round the gnarly problem that, a multi-core, multi-stack, configuration can't have all its stacks at the same low-end of memory, allows them to all be contiguous.

And there are also good reasons to keep the stacks in SCRATCH_X and SCRATCH_Y at top of memory. They allow continuous 256KB RAM to be used, and are in a different 'slice' which I recall has some advantage when it comes to RAM contention.

carlk3
Posts: 189
Joined: Wed Feb 17, 2021 8:46 pm

Re: Stack Location

Sun Dec 10, 2023 7:42 pm

Good points.
hippy wrote: ...
The most obvious thing to do would be to hard-fault on any stack push outside the defined stack area. The trouble there is that it is not usually easy to do, only hard-fault on writes to memory may be available and it's not easy to tell if that's a stack push or an intended data write.
That, to me, is the whole point of putting the stack at the bottom of RAM. You get an immediate HardFault exception the instant you try to overflow the stack. You may or may not want that in production code, but for testing it is great.

Also, there is a lot you can do with isr_hardfault(). For example, see Preserving debugging breadcrumbs across reboots in Cortex-M and m0FaultDispatch.
hippy wrote: The solution is usually to hard-fault on a write to memory within the stack such that it hard-faults when the stack nears exhaustion rather than is exhausted. Nothing will be writing to that memory so, if it ever hard-faults, it's a stack push to a stack nearing exhaustion error. Can be considered a stack overflow.
There are some alternatives for isr_hardfault(). For example,
cleverca22 wrote: you could reset the stackpointer to a known-reserved area upon entry, and use an asm wrapper like non-cortex-m platforms always needed

but there is also a function in the boot rom, to force a hard reset, then jump immediately to a given pc+sp, instead of running the boot2 loader in flash
hippy wrote: That's what the RP2040 offers, what the Pico SDK supports, with what it calls 'stack guards'. So it doesn't seem to make any difference where the stack is, bottom, top, or even in the middle if one wants.
I've only just got this working, and already I have found stack overflows that weren't detected by the 'stack guards'. I think the default setup purposely lets core0 stack overwrite core1 stack and then overwrite the heap. With PICO_USE_STACK_GUARDS=1 (which is not the default) I think it will flag an attempt to overwrite the heap area. Otherwise, the intent seems to be to avoid HardFault whenever possible. That may or may not be appropriate for the application (or for testing the application).
hippy wrote: That also gets round the gnarly problem that, a multi-core, multi-stack, configuration can't have all its stacks at the same low-end of memory, allows them to all be contiguous.
Agreed. I think it might be worth having a couple of linker scripts for use in testing; one with the core0 stack at the bottom of RAM, and one with core1 at the bottom of RAM.
hippy wrote: And there are also good reasons to keeps the stacks in SCRATCH_X and SCRATCH_Y at top of memory. They allow continuous 256KB RAM to be used, and are in a different 'slice' which I recall has some advantage when it comes to RAM contention.
Unfortunately, I am ignorant about that. I was actually just thinking of getting rid of SCRATCH_X and SCRATCH_Y entirely because I don't know what good they are doing me.

arg001
Posts: 606
Joined: Tue Jan 23, 2018 10:06 am

Re: Stack Location

Sun Dec 10, 2023 8:28 pm

The default stack guards are only 32-bytes wide, so (if enabled), they will detect many stack overflows but not a galloping overflow that puts some big structure on the stack (lying over the guard region) and then doesn't actually write to all of it.

It's easy enough to hack the implementation (rp2_common/pico_runtime/runtime.c) if you fancy a slightly wider guard region to give a greater chance of catching these (at the expense of more RAM wasted).

The default SDK setup does miss a trick here: with the two stacks for the two cores placed one after the other in memory, and with the MPU (used to implement the guard) being part of the CPU core hence separate per-core, the CPU with the higher-address stack could make the whole of the other stack an invalid region, giving a much wider stack guard at no cost. This does assume that code on one core doesn't pass pointers to its stack variables to code on the other core - but I think we can reasonably say "don't do that!".

If not wanting to hack the SDK, you could set up some additional MPU regions in your own startup code: each CPU could always invalidate the other core's stack, and maybe things like making any RAM-resident code read-only (not strictly related to stack overflows, but the MPU supports 8 regions so might as well use them to catch other sorts of bugs).

User avatar
adam_green
Posts: 80
Joined: Tue Dec 14, 2021 12:43 am

Re: Stack Location

Sun Dec 10, 2023 8:49 pm

arg001 wrote:
Sun Dec 10, 2023 8:28 pm
The default stack guards are only 32-bytes wide, so (if enabled), they will detect many stack overflows but not a galloping overflow that puts some big structure on the stack (lying over the guard region) and then doesn't actually write to all of it.
...
+1

I agree with everything arg001 has written here and I would add that placing the stack of the core that you think is more likely overflowing at the bottom of RAM makes it even easier to catch such overflows.

dthacher
Posts: 1020
Joined: Sun Jun 06, 2021 12:07 am

Re: Stack Location

Mon Dec 11, 2023 2:00 am

The other way to do this is with virtual addressing, by moving the actual stack. (The linear space exists in hardware but non linear space exists in software. Allowing access faults to undefined regions.)

Overall the programmer will likely need to avoid any chance of stack overflow manually. I see no real way to avoid this on microcontrollers. I somewhat assume the RP2040 to have 128K IRAM and 4K DRAM per core but can allocate some of the IRAM to global data. Technically I can relocate the stack to heap which is in IRAM by default.

The RP2040 has two zones internally: striped RAM (IRAM) and non-striped RAM (DRAM). The programmer is probably best advised to not allow DMA to the scratch register.

Honestly, stack guards represent a casual notion of programming. You need to use secure proxy logic, static resource allocation and careful algorithm planning to prevent deep call stacks. This, like heap fragmentation, is mostly fatal; the only thing you can do is a graceful reboot.

SWD access can pretty much override anything, but what else can compromise a proper instruction stream? The only people that would be attempting to push on to the stack would be a hacker or a casual programmer?

cleverca22
Posts: 8615
Joined: Sat Aug 18, 2012 2:33 pm

Re: Stack Location

Mon Dec 11, 2023 2:28 am

dthacher wrote:
Mon Dec 11, 2023 2:00 am
The other way to do this is with virtual addressing, by moving the actual stack. (The linear space exists in hardware but non linear space exists in software. Allowing access faults to undefined regions.)
don't forget, the pico also has an MPU

it can't remap things like an MMU, but it can still fault if you try to write to certain ram locations
arg001 wrote:
Sun Dec 10, 2023 8:28 pm
The default stack guards are only 32-bytes wide, so (if enabled), they will detect many stack overflows but not a galloping overflow that puts some big structure on the stack (lying over the guard region) and then doesn't actually write to all of it.
and that's where some gcc stack smashing things come into play
on x86, a 4kb page is intentionally left unmapped at the bottom of the stack, so it will fault when full

and gcc is configured, so if you try to allocate >4kb on the stack, it will do a dummy read every 4kb, to step on any landmines (unmapped pages) and get caught

in theory, you could use the MPU to protect a 32byte range, and gcc could be configured to read in 32 byte steps during that protection

carlk3
Posts: 189
Joined: Wed Feb 17, 2021 8:46 pm

Re: Stack Location

Mon Dec 11, 2023 4:17 am

Armv8-M architecture has some nice stack limit checking features: for processors based on the Armv8-M Mainline architecture (like Cortex-M23* or M33), each of the stack pointers has a corresponding stack limit register which allows software to define watermark levels for stack overflow detection, and when stack overflow occurs, a Usage fault or HardFault exception is triggered.

* This is part of the optional Security Attribution Unit (SAU). However, it looks like the cheap chips (e.g., the Renesas RA2E2) have a "stack pointer monitor" as part of the Memory Protection Unit (MPU).

carlk3
Posts: 189
Joined: Wed Feb 17, 2021 8:46 pm

Re: Stack Location

Mon Dec 11, 2023 7:55 am

carlk3 wrote:
Sun Dec 10, 2023 7:42 pm
...
Also, there is a lot you can do with isr_hardfault(). For example, see Preserving debugging breadcrumbs across reboots in Cortex-M
...
This technique doesn't work so well when the cause of the HardFault is an overflow of the stack at the bottom of RAM. I guess that's because there is nowhere to push the stack frame on the way to the exception handler. (My derivative implementation at crash.c.)

carlk3
Posts: 189
Joined: Wed Feb 17, 2021 8:46 pm

Re: Stack Location

Tue Dec 12, 2023 6:56 pm

arg001 wrote:
Sun Dec 10, 2023 8:28 pm
The default stack guards are only 32-bytes wide, so (if enabled), they will detect many stack overflows but not a galloping overflow that puts some big structure on the stack (lying over the guard region) and then doesn't actually write to all of it.

It's easy enough to hack the implementation (rp2_common/pico_runtime/runtime.c) if you fancy a slightly wider guard region to give a greater chance of catching these (at the expense of more RAM wasted).
You mention "stack guards", and the (apparently undocumented) Configuration Parameter is called "PICO_USE_STACK_GUARDS", and that sounds plural, but looking at rp2_common/pico_runtime/runtime.c I only see a single stack guard being installed, for core0.

EDIT: Oh, never mind, I see that PICO_USE_STACK_GUARDS also makes an appearance in rp2_common\pico_multicore\multicore.c.
arg001 wrote:
Sun Dec 10, 2023 8:28 pm
The default SDK setup does miss a trick here: with the two stacks for the two cores placed one after the other in memory, and with the MPU (used to implement the guard) being part of the CPU core hence separate per-core, the CPU with the higher-address stack could make the whole of the other stack an invalid region, giving a much wider stack guard at no cost. This does assume that code on one core doesn't pass pointers to its stack variables to code on the other core - but I think we can reasonably say "don't do that!".

If not wanting to hack the SDK, you could set up some additional MPU regions in your own startup code: each CPU could always invalidate the other core's stack, and maybe things like making any RAM-resident code read-only (not strictly related to stack overflows, but the MPU supports 8 regions so might as well use them to catch other sorts of bugs).
Great ideas!

Return to “SDK”