Commit 4154d94
authored
Unrolled build for #149131
Rollup merge of #149131 - bend-n:optimize_slice_iter_next_chunk, r=Mark-Simulacrum
optimize `slice::Iter::next_chunk`
Codegen for the following example (sourced from a comment on #98326):
```rust
/// Sums the elements of `arr` by pulling them from the slice iterator in
/// fixed-size chunks of `STEP_SIZE` values and accumulating each chunk
/// lane-wise into `result`.
///
/// Only complete chunks are added: the loop ends the first time
/// `next_chunk` does not return `Ok`, so trailing elements that do not fill
/// a whole chunk are left out of the returned total.
pub fn simd_sum_slow(arr: &[u32]) -> u32 {
// Number of lanes, i.e. how many elements are requested per `next_chunk` call.
const STEP_SIZE: usize = 16;
// One running total per lane, all starting at zero.
let mut result = [0; STEP_SIZE];
let mut iter = arr.iter();
// `next_chunk` is the call whose codegen this example is meant to exercise.
while let Ok(c) = iter.next_chunk::<STEP_SIZE>() {
// Add the i-th element of the chunk to the i-th lane.
for (&n, r) in c.iter().zip(result.iter_mut()) {
*r += n;
}
}
// Collapse the per-lane totals into the single returned sum.
result.iter().sum()
}
```
goes from (znver4)
<details>
<summary>many asm</summary>
```assembly
simd_sum_slow:
.cfi_startproc
push rbp
.cfi_def_cfa_offset 16
push r15
.cfi_def_cfa_offset 24
push r14
.cfi_def_cfa_offset 32
push r13
.cfi_def_cfa_offset 40
push r12
.cfi_def_cfa_offset 48
push rbx
.cfi_def_cfa_offset 56
sub rsp, 240
.cfi_def_cfa_offset 296
.cfi_offset rbx, -56
.cfi_offset r12, -48
.cfi_offset r13, -40
.cfi_offset r14, -32
.cfi_offset r15, -24
.cfi_offset rbp, -16
lea r12, [rdi + 4*rsi]
mov qword ptr [rsp - 32], 0
mov dword ptr [rsp - 88], 0
mov dword ptr [rsp - 100], 0
mov dword ptr [rsp - 72], 0
mov dword ptr [rsp - 96], 0
mov dword ptr [rsp - 92], 0
mov dword ptr [rsp - 52], 0
mov dword ptr [rsp - 84], 0
mov dword ptr [rsp - 80], 0
mov dword ptr [rsp - 76], 0
mov dword ptr [rsp - 44], 0
mov dword ptr [rsp - 68], 0
mov dword ptr [rsp - 64], 0
mov dword ptr [rsp - 60], 0
mov dword ptr [rsp - 56], 0
mov dword ptr [rsp - 48], 0
mov qword ptr [rsp + 224], r12
cmp rdi, r12
mov qword ptr [rsp + 96], rdi
je .LBB12_2
.p2align 4
.LBB12_3:
lea r13, [rdi + 4]
cmp r13, r12
je .LBB12_4
lea r15, [rdi + 8]
cmp r15, r12
je .LBB12_6
lea r14, [rdi + 12]
cmp r14, r12
je .LBB12_8
lea rbx, [rdi + 16]
cmp rbx, r12
je .LBB12_10
lea r11, [rdi + 20]
cmp r11, r12
je .LBB12_12
lea r10, [rdi + 24]
cmp r10, r12
je .LBB12_14
lea r9, [rdi + 28]
cmp r9, r12
je .LBB12_16
lea r8, [rdi + 32]
cmp r8, r12
je .LBB12_18
lea rax, [rdi + 36]
cmp rax, r12
je .LBB12_20
mov qword ptr [rsp - 120], rax
lea rax, [rdi + 40]
mov qword ptr [rsp - 112], rax
cmp rax, r12
je .LBB12_22
lea rdx, [rdi + 44]
cmp rdx, r12
je .LBB12_24
lea rbp, [rdi + 48]
cmp rbp, r12
je .LBB12_26
mov qword ptr [rsp - 40], r9
lea rcx, [rdi + 52]
cmp rcx, r12
je .LBB12_30
lea r9, [rdi + 56]
cmp r9, r12
je .LBB12_32
lea rax, [rdi + 60]
cmp rax, r12
mov qword ptr [rsp - 24], r9
je .LBB12_34
mov qword ptr [rsp + 88], rax
lea rax, [rdi + 64]
mov qword ptr [rsp - 8], rax
mov dword ptr [rsp - 128], 0
mov qword ptr [rsp - 16], rdi
mov qword ptr [rsp + 56], r10
lea r10, [rsp + 216]
mov qword ptr [rsp + 40], r10
mov qword ptr [rsp + 96], r13
lea rax, [rsp + 208]
mov qword ptr [rsp + 32], rax
mov qword ptr [rsp + 72], rdx
lea rsi, [rsp + 200]
mov qword ptr [rsp + 48], rbx
lea rbx, [rsp + 192]
mov qword ptr [rsp + 80], rcx
lea rcx, [rsp + 184]
mov qword ptr [rsp + 64], rbp
lea rdx, [rsp + 176]
mov qword ptr [rsp + 16], r14
lea r14, [rsp + 168]
mov qword ptr [rsp + 24], r11
lea r11, [rsp + 160]
lea r9, [rsp + 152]
lea r12, [rsp + 144]
lea r13, [rsp + 136]
lea rbp, [rsp + 128]
mov qword ptr [rsp], r15
lea r15, [rsp + 120]
mov qword ptr [rsp + 8], r8
lea r8, [rsp + 112]
lea rdi, [rsp + 104]
jmp .LBB12_39
.p2align 4
.LBB12_2:
mov qword ptr [rsp - 128], 0
jmp .LBB12_37
.p2align 4
.LBB12_4:
mov eax, 1
mov qword ptr [rsp - 128], rax
jmp .LBB12_37
.p2align 4
.LBB12_6:
mov eax, 2
mov qword ptr [rsp - 128], rax
jmp .LBB12_36
.LBB12_8:
mov eax, 3
mov qword ptr [rsp - 128], rax
jmp .LBB12_36
.LBB12_10:
mov eax, 4
mov qword ptr [rsp - 128], rax
jmp .LBB12_36
.LBB12_12:
mov eax, 5
mov qword ptr [rsp - 128], rax
jmp .LBB12_36
.LBB12_14:
mov eax, 6
mov qword ptr [rsp - 128], rax
jmp .LBB12_36
.LBB12_16:
mov eax, 7
mov qword ptr [rsp - 128], rax
jmp .LBB12_36
.LBB12_18:
mov eax, 8
mov qword ptr [rsp - 128], rax
jmp .LBB12_36
.LBB12_20:
mov eax, 9
mov qword ptr [rsp - 128], rax
jmp .LBB12_36
.LBB12_22:
mov eax, 10
mov qword ptr [rsp - 128], rax
mov rdi, qword ptr [rsp - 120]
jmp .LBB12_36
.LBB12_24:
mov eax, 11
mov qword ptr [rsp - 128], rax
jmp .LBB12_27
.LBB12_26:
mov eax, 12
mov qword ptr [rsp - 128], rax
.LBB12_27:
mov rsi, qword ptr [rsp - 112]
mov rdi, qword ptr [rsp - 120]
jmp .LBB12_36
.LBB12_30:
mov rcx, rbp
mov eax, 13
mov qword ptr [rsp - 128], rax
jmp .LBB12_35
.LBB12_32:
mov rax, rcx
mov rcx, rbp
mov esi, 14
mov qword ptr [rsp - 128], rsi
jmp .LBB12_35
.LBB12_34:
mov rax, rcx
mov rcx, rbp
mov esi, 15
mov qword ptr [rsp - 128], rsi
.LBB12_35:
mov rsi, qword ptr [rsp - 112]
mov rdi, qword ptr [rsp - 120]
mov r9, qword ptr [rsp - 40]
.p2align 4
.LBB12_36:
mov rbp, r13
.LBB12_37:
mov r13, qword ptr [rsp - 24]
mov qword ptr [rsp + 88], r13
mov r13, qword ptr [rsp - 128]
mov qword ptr [rsp + 216], r13
mov r13b, 1
mov dword ptr [rsp - 128], r13d
mov qword ptr [rsp - 24], rax
mov qword ptr [rsp + 80], rcx
mov qword ptr [rsp + 64], rdx
mov qword ptr [rsp + 72], rsi
mov qword ptr [rsp - 112], rdi
mov qword ptr [rsp - 120], r8
mov qword ptr [rsp + 8], r9
mov qword ptr [rsp - 40], r10
mov qword ptr [rsp + 56], r11
mov qword ptr [rsp + 24], rbx
mov qword ptr [rsp + 48], r14
mov qword ptr [rsp + 16], r15
mov qword ptr [rsp], rbp
mov eax, 0
mov qword ptr [rsp - 16], rax
mov qword ptr [rsp - 8], r12
lea r10, [rsp + 208]
mov qword ptr [rsp + 40], r10
lea rax, [rsp + 200]
mov qword ptr [rsp + 32], rax
lea rsi, [rsp + 192]
lea rbx, [rsp + 184]
lea rcx, [rsp + 176]
lea rdx, [rsp + 168]
lea r14, [rsp + 160]
lea r11, [rsp + 152]
lea r9, [rsp + 144]
lea r12, [rsp + 136]
lea r13, [rsp + 128]
lea rbp, [rsp + 120]
lea r15, [rsp + 112]
lea r8, [rsp + 104]
lea rdi, [rsp + 232]
.LBB12_39:
mov r10, qword ptr [rsp + 96]
mov rax, qword ptr [rsp + 40]
mov qword ptr [rax], r10
mov rax, qword ptr [rsp + 32]
mov r10, qword ptr [rsp]
mov qword ptr [rax], r10
mov rax, qword ptr [rsp + 16]
mov qword ptr [rsi], rax
mov rax, qword ptr [rsp + 48]
mov qword ptr [rbx], rax
mov rax, qword ptr [rsp + 24]
mov qword ptr [rcx], rax
mov rax, qword ptr [rsp + 56]
mov qword ptr [rdx], rax
mov rax, qword ptr [rsp - 40]
mov qword ptr [r14], rax
mov rax, qword ptr [rsp + 8]
mov qword ptr [r11], rax
mov rax, qword ptr [rsp - 120]
mov qword ptr [r9], rax
mov rax, qword ptr [rsp - 112]
mov qword ptr [r12], rax
mov rax, qword ptr [rsp + 72]
mov qword ptr [r13], rax
mov rax, qword ptr [rsp + 64]
mov qword ptr [rbp], rax
mov rax, qword ptr [rsp + 80]
mov qword ptr [r15], rax
mov rax, qword ptr [rsp - 24]
mov qword ptr [r8], rax
mov rax, qword ptr [rsp + 88]
mov qword ptr [rdi], rax
cmp byte ptr [rsp - 128], 0
jne .LBB12_40
mov rax, qword ptr [rsp - 32]
mov rcx, qword ptr [rsp - 16]
add eax, dword ptr [rcx]
mov qword ptr [rsp - 32], rax
mov rax, qword ptr [rsp + 216]
mov ecx, dword ptr [rsp - 88]
add ecx, dword ptr [rax]
mov dword ptr [rsp - 88], ecx
mov rax, qword ptr [rsp + 208]
mov ecx, dword ptr [rsp - 100]
add ecx, dword ptr [rax]
mov dword ptr [rsp - 100], ecx
mov rax, qword ptr [rsp + 200]
mov ecx, dword ptr [rsp - 72]
add ecx, dword ptr [rax]
mov dword ptr [rsp - 72], ecx
mov rax, qword ptr [rsp + 192]
mov ecx, dword ptr [rsp - 96]
add ecx, dword ptr [rax]
mov dword ptr [rsp - 96], ecx
mov rax, qword ptr [rsp + 184]
mov ecx, dword ptr [rsp - 92]
add ecx, dword ptr [rax]
mov dword ptr [rsp - 92], ecx
mov rax, qword ptr [rsp + 176]
mov ecx, dword ptr [rsp - 52]
add ecx, dword ptr [rax]
mov dword ptr [rsp - 52], ecx
mov rax, qword ptr [rsp + 168]
mov ecx, dword ptr [rsp - 84]
add ecx, dword ptr [rax]
mov dword ptr [rsp - 84], ecx
mov rax, qword ptr [rsp + 160]
mov ecx, dword ptr [rsp - 80]
add ecx, dword ptr [rax]
mov dword ptr [rsp - 80], ecx
mov rax, qword ptr [rsp + 152]
mov ecx, dword ptr [rsp - 76]
add ecx, dword ptr [rax]
mov dword ptr [rsp - 76], ecx
mov rax, qword ptr [rsp + 144]
mov ecx, dword ptr [rsp - 44]
add ecx, dword ptr [rax]
mov dword ptr [rsp - 44], ecx
mov rax, qword ptr [rsp + 136]
mov ecx, dword ptr [rsp - 68]
add ecx, dword ptr [rax]
mov dword ptr [rsp - 68], ecx
mov rax, qword ptr [rsp + 128]
mov ecx, dword ptr [rsp - 64]
add ecx, dword ptr [rax]
mov dword ptr [rsp - 64], ecx
mov rax, qword ptr [rsp + 120]
mov ecx, dword ptr [rsp - 60]
add ecx, dword ptr [rax]
mov dword ptr [rsp - 60], ecx
mov rax, qword ptr [rsp + 112]
mov ecx, dword ptr [rsp - 56]
add ecx, dword ptr [rax]
mov dword ptr [rsp - 56], ecx
mov rax, qword ptr [rsp + 104]
mov ecx, dword ptr [rsp - 48]
add ecx, dword ptr [rax]
mov dword ptr [rsp - 48], ecx
mov rdi, qword ptr [rsp - 8]
mov r12, qword ptr [rsp + 224]
cmp rdi, r12
mov qword ptr [rsp + 96], rdi
jne .LBB12_3
jmp .LBB12_2
.LBB12_40:
mov eax, dword ptr [rsp - 88]
add eax, dword ptr [rsp - 32]
mov ecx, dword ptr [rsp - 72]
add ecx, dword ptr [rsp - 100]
add ecx, eax
mov edx, dword ptr [rsp - 92]
add edx, dword ptr [rsp - 96]
mov eax, dword ptr [rsp - 52]
add eax, edx
add eax, ecx
mov ecx, dword ptr [rsp - 80]
add ecx, dword ptr [rsp - 84]
mov edx, dword ptr [rsp - 76]
add edx, ecx
mov ecx, dword ptr [rsp - 44]
add ecx, edx
add ecx, eax
mov edx, dword ptr [rsp - 64]
add edx, dword ptr [rsp - 68]
mov eax, dword ptr [rsp - 60]
add eax, edx
mov edx, dword ptr [rsp - 56]
add edx, eax
mov eax, dword ptr [rsp - 48]
add eax, edx
add eax, ecx
add rsp, 240
.cfi_def_cfa_offset 56
pop rbx
.cfi_def_cfa_offset 48
pop r12
.cfi_def_cfa_offset 40
pop r13
.cfi_def_cfa_offset 32
pop r14
.cfi_def_cfa_offset 24
pop r15
.cfi_def_cfa_offset 16
pop rbp
.cfi_def_cfa_offset 8
ret
```
</details>
to
```assembly
simd_sum_slow:
.cfi_startproc
xor eax, eax
cmp rsi, 16
jb .LBB12_4
shl rsi, 2
pxor xmm0, xmm0
pxor xmm1, xmm1
pxor xmm3, xmm3
pxor xmm2, xmm2
.p2align 4
.LBB12_2:
movdqu xmm4, xmmword ptr [rdi]
paddd xmm0, xmm4
movdqu xmm4, xmmword ptr [rdi + 16]
paddd xmm1, xmm4
movdqu xmm4, xmmword ptr [rdi + 32]
paddd xmm3, xmm4
movdqu xmm4, xmmword ptr [rdi + 48]
paddd xmm2, xmm4
add rdi, 64
add rsi, -64
cmp rsi, 60
ja .LBB12_2
paddd xmm0, xmm3
paddd xmm1, xmm2
paddd xmm1, xmm0
pshufd xmm0, xmm1, 238
paddd xmm0, xmm1
pshufd xmm1, xmm0, 85
paddd xmm1, xmm0
movd eax, xmm1
.LBB12_4:
ret
```

2 files changed: +26 -2 lines changed

| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
139 | 139 | | |
140 | 140 | | |
141 | 141 | | |
142 | | - | |
| 142 | + | |
143 | 143 | | |
144 | 144 | | |
145 | 145 | | |
| |||
368 | 368 | | |
369 | 369 | | |
370 | 370 | | |
371 | | - | |
| 371 | + | |
372 | 372 | | |
373 | 373 | | |
374 | 374 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
68 | 68 | | |
69 | 69 | | |
70 | 70 | | |
| 71 | + | |
71 | 72 | | |
72 | 73 | | |
73 | 74 | | |
| |||
190 | 191 | | |
191 | 192 | | |
192 | 193 | | |
| 194 | + | |
| 195 | + | |
| 196 | + | |
| 197 | + | |
| 198 | + | |
| 199 | + | |
| 200 | + | |
| 201 | + | |
| 202 | + | |
| 203 | + | |
| 204 | + | |
| 205 | + | |
| 206 | + | |
| 207 | + | |
| 208 | + | |
| 209 | + | |
| 210 | + | |
| 211 | + | |
| 212 | + | |
| 213 | + | |
| 214 | + | |
| 215 | + | |
| 216 | + | |
193 | 217 | | |
194 | 218 | | |
195 | 219 | | |
| |||
0 commit comments