-
Notifications
You must be signed in to change notification settings - Fork 5.1k
Closed
Labels
arch-arm64 · area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI) · in-pr (There is an active PR which will close this issue when it is merged)
Milestone
Description
Currently, `void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, bool* pInitRegZeroed)` inlines a zeroing loop for frames larger than 10 machine words (80 bytes on Arm64). The loop uses the `wzr` or `xzr` register with `stp` or `str` instructions and can write up to 16 bytes of zeros at once.
Following the ideas in #32538, we can
- zero-initialize a SIMD register `qReg`
- use the register `qReg` instead of `xzr` with `stp qReg, qReg, [mem]`,
allowing up to 32 bytes of zeros to be written to memory in one instruction.
We can also consider increasing the upper boundary (i.e. 10 machine words) to some larger number.
It seems that Clang/LLVM uses a similar approach for initializing stack-allocated structs.
https://godbolt.org/z/8rKxvn
For example,
#include <string.h>
// Four int fields. In the ZeroInt32x4 listing below, a local of this type is
// cleared with a single 16-byte store (stp xzr, xzr).
struct int32x4
{
int _1;
int _2;
int _3;
int _4;
};
// Eight int fields. In the ZeroInt32x8 listing below, a local of this type is
// cleared with a single 32-byte store (stp q0, q0).
struct int32x8
{
int _1;
int _2;
int _3;
int _4;
int _5;
int _6;
int _7;
int _8;
};
// Sixteen int fields. In the ZeroInt32x16 listing below, a local of this type
// is cleared with two 32-byte stores (stp q0, q0), 64 bytes in total.
struct int32x16
{
int _1;
int _2;
int _3;
int _4;
int _5;
int _6;
int _7;
int _8;
int _9;
int _10;
int _11;
int _12;
int _13;
int _14;
int _15;
int _16;
};
// Value-initializes a local int32x4 (`= { }`) and copies cnt bytes of it to pDst.
// NOTE(review): cnt is not checked against sizeof(src); presumably the runtime
// length only exists to keep the zeroed local on the stack - confirm.
void ZeroInt32x4(void* pDst, int cnt)
{
int32x4 src = { };
memcpy(pDst, &src, cnt);
}
// Value-initializes a local int32x8 (`= { }`) and copies cnt bytes of it to pDst.
// NOTE(review): cnt is not checked against sizeof(src); presumably the runtime
// length only exists to keep the zeroed local on the stack - confirm.
void ZeroInt32x8(void* pDst, int cnt)
{
int32x8 src = { };
memcpy(pDst, &src, cnt);
}
// Value-initializes a local int32x16 (`= { }`) and copies cnt bytes of it to pDst.
// NOTE(review): cnt is not checked against sizeof(src); presumably the runtime
// length only exists to keep the zeroed local on the stack - confirm.
void ZeroInt32x16(void* pDst, int cnt)
{
int32x16 src = { };
memcpy(pDst, &src, cnt);
}
would be compiled down to
// Compiler output for ZeroInt32x4: 32-byte frame with src at [sp] and the
// saved x29/x30 pair at [sp, #16]. x0 (pDst) is passed through to memcpy untouched.
ZeroInt32x4(void*, int): // @ZeroInt32x4(void*, int)
sub sp, sp, #32 // =32
stp x29, x30, [sp, #16] // 16-byte Folded Spill
add x29, sp, #16 // =16
// Sign-extend cnt (w1) into x2, the third argument register for memcpy.
sxtw x2, w1
// Second argument: address of src.
mov x1, sp
// Zero all 16 bytes of src with one pair store of the zero register.
stp xzr, xzr, [sp]
bl memcpy
ldp x29, x30, [sp, #16] // 16-byte Folded Reload
add sp, sp, #32 // =32
ret
// Compiler output for ZeroInt32x8: 48-byte frame with src at [sp] and the
// saved x29/x30 pair at [sp, #32]. x0 (pDst) is passed through to memcpy untouched.
ZeroInt32x8(void*, int): // @ZeroInt32x8(void*, int)
sub sp, sp, #48 // =48
stp x29, x30, [sp, #32] // 16-byte Folded Spill
add x29, sp, #32 // =32
// Materialize an all-zero 128-bit vector register; q0 is used as the zero source below.
movi v0.2d, #0000000000000000
// Sign-extend cnt (w1) into x2, the third argument register for memcpy.
sxtw x2, w1
// Second argument: address of src.
mov x1, sp
// Zero all 32 bytes of src with one pair store of the zeroed q register.
stp q0, q0, [sp]
bl memcpy
ldp x29, x30, [sp, #32] // 16-byte Folded Reload
add sp, sp, #48 // =48
ret
// Compiler output for ZeroInt32x16: 80-byte frame with src at [sp] and the
// saved x29/x30 pair at [sp, #64]. x0 (pDst) is passed through to memcpy untouched.
ZeroInt32x16(void*, int): // @ZeroInt32x16(void*, int)
sub sp, sp, #80 // =80
stp x29, x30, [sp, #64] // 16-byte Folded Spill
add x29, sp, #64 // =64
// Materialize an all-zero 128-bit vector register; q0 is used as the zero source below.
movi v0.2d, #0000000000000000
// Sign-extend cnt (w1) into x2, the third argument register for memcpy.
sxtw x2, w1
// Second argument: address of src.
mov x1, sp
// Zero the 64 bytes of src with two 32-byte pair stores (upper half, then lower half).
stp q0, q0, [sp, #32]
stp q0, q0, [sp]
bl memcpy
ldp x29, x30, [sp, #64] // 16-byte Folded Reload
add sp, sp, #80 // =80
ret
@dotnet/jit-contrib @TamarChristinaArm
Metadata
Metadata
Assignees
Labels
arch-arm64 · area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI) · in-pr (There is an active PR which will close this issue when it is merged)
Type
Projects
Status
Done