-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Closed
Description
Hey,
(To include maybe to #77010 ?)
Opening an issue to investigate the performance difference when running XxHash128 on ARM64 (here) while tracking the difference of performance with the native code, I have spotted 2 issues so far:
Missing combine eor+lsr
The following C# code (declared in XxHashShared.cs):
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ulong XorShift(ulong value, int shift)
{
Debug.Assert(shift >= 0 && shift < 64);
return value ^ (value >> shift);
}is generating the following code:
D360FC20 lsr x0, x1, #32
CA000021 eor x1, x1, x0
while it could combine instructions like:
eor x1, x0, x0, lsr #32
You can see the impact in the following code dumped, as there are many eor+lsr combined operations.
Duplicate constant reloads
For some unknown reasons that I'm not able to reproduce on a synthetic test, it seems that e.g some code in HashLength9To16 is duplicating the load of several ulong consts:
7100203F cmp w1, #8
540007A9 bls G_M000_IG04
F9400002 ldr x2, [x0]
8B214000 add x0, x0, w1, UXTW
F85F8000 ldr x0, [x0,#-0x08]
CA000042 eor x2, x2, x0
D2846923 movz x3, #0x2349
F2A666C3 movk x3, #0x3336 LSL #16
F2C7E003 movk x3, #0x3F00 LSL #32
F2EB32E3 movk x3, #0x5997 LSL #48
CA030042 eor x2, x2, x3
D29950E3 movz x3, #0xCA87 ; <--- duplicate
F2B0BD63 movk x3, #0x85EB LSL #16
F2CF3623 movk x3, #0x79B1 LSL #32
F2F3C6E3 movk x3, #0x9E37 LSL #48
9B037C43 mul x3, x2, x3
D29950E4 movz x4, #0xCA87 ; <--- duplicate
F2B0BD64 movk x4, #0x85EB LSL #16
F2CF3624 movk x4, #0x79B1 LSL #32
F2F3C6E4 movk x4, #0x9E37 LSL #48
9BC47C42 umulh x2, x2, x4
51000421 sub w1, w1, #1
2A0103E1 mov w1, w1
8B01D863 add x3, x3, x1, LSL #54
D287AB01 movz x1, #0x3D58
F2B25AC1 movk x1, #0x92D6 LSL #16
F2CF2EC1 movk x1, #0x7976 LSL #32
F2F84041 movk x1, #0xC202 LSL #48
CA010000 eor x0, x0, x1
8B000042 add x2, x2, x0
2A0003E0 mov w0, w0
D2994EC1 movz x1, #0xCA76
F2B0BD61 movk x1, #0x85EB LSL #16
9B010802 madd x2, x0, x1, x2
DAC00C40 rev x0, x2
CA000063 eor x3, x3, x0
D29D69E0 movz x0, #0xEB4F
F2A4FA80 movk x0, #0x27D4 LSL #16
F2D5C7A0 movk x0, #0xAE3D LSL #32
F2F85640 movk x0, #0xC2B2 LSL #48
9B007C61 mul x1, x3, x0
9BC07C63 umulh x3, x3, x0
9B000C43 madd x3, x2, x0, x3
D365FC20 lsr x0, x1, #37
CA000022 eor x2, x1, x0
D28F3F20 movz x0, #0x79F9 ; <--- duplicate
F2B3C6E0 movk x0, #0x9E37 LSL #16
F2CCF220 movk x0, #0x6791 LSL #32
F2E2CAC0 movk x0, #0x1656 LSL #48
9B007C42 mul x2, x2, x0
D360FC40 lsr x0, x2, #32
CA000042 eor x2, x2, x0
D365FC60 lsr x0, x3, #37
CA000061 eor x1, x3, x0
D28F3F20 movz x0, #0x79F9 ; <--- duplicate
F2B3C6E0 movk x0, #0x9E37 LSL #16
F2CCF220 movk x0, #0x6791 LSL #32
F2E2CAC0 movk x0, #0x1656 LSL #48
9B007C21 mul x1, x1, x0
D360FC20 lsr x0, x1, #32
CA000021 eor x1, x1, x0
14000065 b G_M000_IG07
while the native version is generating the following:
ldr x8, [x0]
add x9, x0, x1
ldur x9, [x9, #-8]
eor x8, x8, x9
mov x10, #9033
movk x10, #13110, lsl #16
movk x10, #16128, lsl #32
movk x10, #22935, lsl #48
eor x8, x8, x10
mov x10, #51847
movk x10, #34283, lsl #16
movk x10, #31153, lsl #32
movk x10, #40503, lsl #48
umulh x11, x8, x10
mul x8, x8, x10
add x8, x8, x1, lsl #54
mov x10, #15704
movk x10, #37590, lsl #16
movk x10, #31094, lsl #32
movk x10, #49666, lsl #48
eor x9, x9, x10
and x10, x9, #0xffffffff
mov w12, #51830
movk w12, #34283, lsl #16
madd x9, x10, x12, x9
mov x10, #-18014398509481984
add x9, x9, x11
rev x11, x9
add x8, x8, x10
eor x8, x11, x8
mov x10, #60239
movk x10, #10196, lsl #16
movk x10, #44605, lsl #32
movk x10, #49842, lsl #48
mul x11, x8, x10
umulh x8, x8, x10
madd x8, x9, x10, x8
eor x9, x11, x11, lsr #37
mov x10, #31225
movk x10, #40503, lsl #16
movk x10, #26513, lsl #32
movk x10, #5718, lsl #48
mul x9, x9, x10
eor x0, x9, x9, lsr #32
eor x8, x8, x8, lsr #37
mul x8, x8, x10
eor x1, x8, x8, lsr #32
ret
Metadata
Metadata
Assignees
Labels
No labels