[Arm64] Use stp and str (SIMD) for stack prolog zeroing  

Currently, [`void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, bool* pInitRegZeroed)`](https://github.com/dotnet/runtime/blob/54906ea87c9d8ff3df0b341f02ae255fd58820bd/src/coreclr/src/jit/codegencommon.cpp#L6131) inlines a zeroing loop for frames larger than 10 machine words (80 bytes on Arm64). The loop stores the zero register (`wzr` or `xzr`) with `str` or `stp` instructions and can write up to 16 bytes of zeros per instruction.

Following the ideas in https://github.com/dotnet/runtime/pull/32538, we can:
1) zero-initialize a SIMD register `qReg`;
2) use `qReg` instead of `xzr` in `stp qReg, qReg, [mem]`, which writes up to 32 bytes of zeros to memory in one instruction.

We can also consider increasing the upper boundary (i.e. 10 machine words) to some larger number.

It seems that Clang/LLVM uses a similar approach when initializing stack-allocated structs:
https://godbolt.org/z/8rKxvn

For example,
```c++
#include <string.h>

struct int32x4 
{
    int _1;
    int _2;
    int _3;
    int _4;
};

struct int32x8 
{
    int _1;
    int _2;
    int _3;
    int _4;
    int _5;
    int _6;
    int _7;
    int _8;
};

struct int32x16
{
    int _1;
    int _2;
    int _3;
    int _4;
    int _5;
    int _6;
    int _7;
    int _8;
    int _9;
    int _10;
    int _11;
    int _12;
    int _13;
    int _14;
    int _15;
    int _16;
};

void ZeroInt32x4(void* pDst, int cnt)
{
    int32x4 src = { };
    memcpy(pDst, &src, cnt);
}

void ZeroInt32x8(void* pDst, int cnt)
{
    int32x8 src = { };
    memcpy(pDst, &src, cnt);
}

void ZeroInt32x16(void* pDst, int cnt)
{
    int32x16 src = { };
    memcpy(pDst, &src, cnt);
}
```
would be compiled down to
```asm
ZeroInt32x4(void*, int):                     // @ZeroInt32x4(void*, int)
        sub     sp, sp, #32                     // =32
        stp     x29, x30, [sp, #16]             // 16-byte Folded Spill
        add     x29, sp, #16                    // =16
        sxtw    x2, w1
        mov     x1, sp
        stp     xzr, xzr, [sp]
        bl      memcpy
        ldp     x29, x30, [sp, #16]             // 16-byte Folded Reload
        add     sp, sp, #32                     // =32
        ret
ZeroInt32x8(void*, int):                     // @ZeroInt32x8(void*, int)
        sub     sp, sp, #48                     // =48
        stp     x29, x30, [sp, #32]             // 16-byte Folded Spill
        add     x29, sp, #32                    // =32
        movi    v0.2d, #0000000000000000
        sxtw    x2, w1
        mov     x1, sp
        stp     q0, q0, [sp]
        bl      memcpy
        ldp     x29, x30, [sp, #32]             // 16-byte Folded Reload
        add     sp, sp, #48                     // =48
        ret
ZeroInt32x16(void*, int):                    // @ZeroInt32x16(void*, int)
        sub     sp, sp, #80                     // =80
        stp     x29, x30, [sp, #64]             // 16-byte Folded Spill
        add     x29, sp, #64                    // =64
        movi    v0.2d, #0000000000000000
        sxtw    x2, w1
        mov     x1, sp
        stp     q0, q0, [sp, #32]
        stp     q0, q0, [sp]
        bl      memcpy
        ldp     x29, x30, [sp, #64]             // 16-byte Folded Reload
        add     sp, sp, #80                     // =80
        ret
```

@dotnet/jit-contrib @TamarChristinaArm 

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[Arm64] Use stp and str (SIMD) for stack prolog zeroing #43789

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[Arm64] Use stp and str (SIMD) for stack prolog zeroing #43789

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions