-
Notifications
You must be signed in to change notification settings - Fork 14k
optimize slice::Iter::next_chunk
#149131
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
optimize slice::Iter::next_chunk
#149131
Conversation
|
rustbot has assigned @Mark-Simulacrum. Use |
305559b to
5df6fad
Compare
| .$array_ref(); // must convert &[T; N] to [&T; N] | ||
| Ok(r) | ||
| } else { | ||
| // can't use copy_nonoverlapping due to sadness |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Say more about sadness?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree with Markus
|
Reminder, once the PR becomes ready for a review, use |
5df6fad to
a3909f5
Compare
|
This PR was rebased onto a different main commit. Here's a range-diff highlighting what actually changed. Rebasing is a normal part of keeping PRs up to date, so no action is needed—this note is just to help reviewers. |
a3909f5 to
9a48fb6
Compare
|
@rustbot ready |
|
@bors r+ |
…unk, r=Mark-Simulacrum optimize `slice::Iter::next_chunk` codegen for example (sourced from rust-lang#98326 (comment)) ```rust pub fn simd_sum_slow(arr: &[u32]) -> u32 { const STEP_SIZE: usize = 16; let mut result = [0; STEP_SIZE]; let mut iter = arr.iter(); while let Ok(c) = iter.next_chunk::<STEP_SIZE>() { for (&n, r) in c.iter().zip(result.iter_mut()) { *r += n; } } result.iter().sum() } ``` goes from (znver4) <details> <summary>many asm</summary> ```assembly simd_sum_slow: .cfi_startproc push rbp .cfi_def_cfa_offset 16 push r15 .cfi_def_cfa_offset 24 push r14 .cfi_def_cfa_offset 32 push r13 .cfi_def_cfa_offset 40 push r12 .cfi_def_cfa_offset 48 push rbx .cfi_def_cfa_offset 56 sub rsp, 240 .cfi_def_cfa_offset 296 .cfi_offset rbx, -56 .cfi_offset r12, -48 .cfi_offset r13, -40 .cfi_offset r14, -32 .cfi_offset r15, -24 .cfi_offset rbp, -16 lea r12, [rdi + 4*rsi] mov qword ptr [rsp - 32], 0 mov dword ptr [rsp - 88], 0 mov dword ptr [rsp - 100], 0 mov dword ptr [rsp - 72], 0 mov dword ptr [rsp - 96], 0 mov dword ptr [rsp - 92], 0 mov dword ptr [rsp - 52], 0 mov dword ptr [rsp - 84], 0 mov dword ptr [rsp - 80], 0 mov dword ptr [rsp - 76], 0 mov dword ptr [rsp - 44], 0 mov dword ptr [rsp - 68], 0 mov dword ptr [rsp - 64], 0 mov dword ptr [rsp - 60], 0 mov dword ptr [rsp - 56], 0 mov dword ptr [rsp - 48], 0 mov qword ptr [rsp + 224], r12 cmp rdi, r12 mov qword ptr [rsp + 96], rdi je .LBB12_2 .p2align 4 .LBB12_3: lea r13, [rdi + 4] cmp r13, r12 je .LBB12_4 lea r15, [rdi + 8] cmp r15, r12 je .LBB12_6 lea r14, [rdi + 12] cmp r14, r12 je .LBB12_8 lea rbx, [rdi + 16] cmp rbx, r12 je .LBB12_10 lea r11, [rdi + 20] cmp r11, r12 je .LBB12_12 lea r10, [rdi + 24] cmp r10, r12 je .LBB12_14 lea r9, [rdi + 28] cmp r9, r12 je .LBB12_16 lea r8, [rdi + 32] cmp r8, r12 je .LBB12_18 lea rax, [rdi + 36] cmp rax, r12 je .LBB12_20 mov qword ptr [rsp - 120], rax lea rax, [rdi + 40] mov qword ptr [rsp - 112], rax cmp rax, r12 je .LBB12_22 lea rdx, [rdi + 44] cmp rdx, r12 je .LBB12_24 lea rbp, 
[rdi + 48] cmp rbp, r12 je .LBB12_26 mov qword ptr [rsp - 40], r9 lea rcx, [rdi + 52] cmp rcx, r12 je .LBB12_30 lea r9, [rdi + 56] cmp r9, r12 je .LBB12_32 lea rax, [rdi + 60] cmp rax, r12 mov qword ptr [rsp - 24], r9 je .LBB12_34 mov qword ptr [rsp + 88], rax lea rax, [rdi + 64] mov qword ptr [rsp - 8], rax mov dword ptr [rsp - 128], 0 mov qword ptr [rsp - 16], rdi mov qword ptr [rsp + 56], r10 lea r10, [rsp + 216] mov qword ptr [rsp + 40], r10 mov qword ptr [rsp + 96], r13 lea rax, [rsp + 208] mov qword ptr [rsp + 32], rax mov qword ptr [rsp + 72], rdx lea rsi, [rsp + 200] mov qword ptr [rsp + 48], rbx lea rbx, [rsp + 192] mov qword ptr [rsp + 80], rcx lea rcx, [rsp + 184] mov qword ptr [rsp + 64], rbp lea rdx, [rsp + 176] mov qword ptr [rsp + 16], r14 lea r14, [rsp + 168] mov qword ptr [rsp + 24], r11 lea r11, [rsp + 160] lea r9, [rsp + 152] lea r12, [rsp + 144] lea r13, [rsp + 136] lea rbp, [rsp + 128] mov qword ptr [rsp], r15 lea r15, [rsp + 120] mov qword ptr [rsp + 8], r8 lea r8, [rsp + 112] lea rdi, [rsp + 104] jmp .LBB12_39 .p2align 4 .LBB12_2: mov qword ptr [rsp - 128], 0 jmp .LBB12_37 .p2align 4 .LBB12_4: mov eax, 1 mov qword ptr [rsp - 128], rax jmp .LBB12_37 .p2align 4 .LBB12_6: mov eax, 2 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_8: mov eax, 3 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_10: mov eax, 4 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_12: mov eax, 5 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_14: mov eax, 6 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_16: mov eax, 7 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_18: mov eax, 8 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_20: mov eax, 9 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_22: mov eax, 10 mov qword ptr [rsp - 128], rax mov rdi, qword ptr [rsp - 120] jmp .LBB12_36 .LBB12_24: mov eax, 11 mov qword ptr [rsp - 128], rax jmp .LBB12_27 .LBB12_26: mov eax, 12 mov qword ptr [rsp - 128], rax .LBB12_27: mov rsi, qword ptr [rsp - 112] 
mov rdi, qword ptr [rsp - 120] jmp .LBB12_36 .LBB12_30: mov rcx, rbp mov eax, 13 mov qword ptr [rsp - 128], rax jmp .LBB12_35 .LBB12_32: mov rax, rcx mov rcx, rbp mov esi, 14 mov qword ptr [rsp - 128], rsi jmp .LBB12_35 .LBB12_34: mov rax, rcx mov rcx, rbp mov esi, 15 mov qword ptr [rsp - 128], rsi .LBB12_35: mov rsi, qword ptr [rsp - 112] mov rdi, qword ptr [rsp - 120] mov r9, qword ptr [rsp - 40] .p2align 4 .LBB12_36: mov rbp, r13 .LBB12_37: mov r13, qword ptr [rsp - 24] mov qword ptr [rsp + 88], r13 mov r13, qword ptr [rsp - 128] mov qword ptr [rsp + 216], r13 mov r13b, 1 mov dword ptr [rsp - 128], r13d mov qword ptr [rsp - 24], rax mov qword ptr [rsp + 80], rcx mov qword ptr [rsp + 64], rdx mov qword ptr [rsp + 72], rsi mov qword ptr [rsp - 112], rdi mov qword ptr [rsp - 120], r8 mov qword ptr [rsp + 8], r9 mov qword ptr [rsp - 40], r10 mov qword ptr [rsp + 56], r11 mov qword ptr [rsp + 24], rbx mov qword ptr [rsp + 48], r14 mov qword ptr [rsp + 16], r15 mov qword ptr [rsp], rbp mov eax, 0 mov qword ptr [rsp - 16], rax mov qword ptr [rsp - 8], r12 lea r10, [rsp + 208] mov qword ptr [rsp + 40], r10 lea rax, [rsp + 200] mov qword ptr [rsp + 32], rax lea rsi, [rsp + 192] lea rbx, [rsp + 184] lea rcx, [rsp + 176] lea rdx, [rsp + 168] lea r14, [rsp + 160] lea r11, [rsp + 152] lea r9, [rsp + 144] lea r12, [rsp + 136] lea r13, [rsp + 128] lea rbp, [rsp + 120] lea r15, [rsp + 112] lea r8, [rsp + 104] lea rdi, [rsp + 232] .LBB12_39: mov r10, qword ptr [rsp + 96] mov rax, qword ptr [rsp + 40] mov qword ptr [rax], r10 mov rax, qword ptr [rsp + 32] mov r10, qword ptr [rsp] mov qword ptr [rax], r10 mov rax, qword ptr [rsp + 16] mov qword ptr [rsi], rax mov rax, qword ptr [rsp + 48] mov qword ptr [rbx], rax mov rax, qword ptr [rsp + 24] mov qword ptr [rcx], rax mov rax, qword ptr [rsp + 56] mov qword ptr [rdx], rax mov rax, qword ptr [rsp - 40] mov qword ptr [r14], rax mov rax, qword ptr [rsp + 8] mov qword ptr [r11], rax mov rax, qword ptr [rsp - 120] mov qword ptr [r9], 
rax mov rax, qword ptr [rsp - 112] mov qword ptr [r12], rax mov rax, qword ptr [rsp + 72] mov qword ptr [r13], rax mov rax, qword ptr [rsp + 64] mov qword ptr [rbp], rax mov rax, qword ptr [rsp + 80] mov qword ptr [r15], rax mov rax, qword ptr [rsp - 24] mov qword ptr [r8], rax mov rax, qword ptr [rsp + 88] mov qword ptr [rdi], rax cmp byte ptr [rsp - 128], 0 jne .LBB12_40 mov rax, qword ptr [rsp - 32] mov rcx, qword ptr [rsp - 16] add eax, dword ptr [rcx] mov qword ptr [rsp - 32], rax mov rax, qword ptr [rsp + 216] mov ecx, dword ptr [rsp - 88] add ecx, dword ptr [rax] mov dword ptr [rsp - 88], ecx mov rax, qword ptr [rsp + 208] mov ecx, dword ptr [rsp - 100] add ecx, dword ptr [rax] mov dword ptr [rsp - 100], ecx mov rax, qword ptr [rsp + 200] mov ecx, dword ptr [rsp - 72] add ecx, dword ptr [rax] mov dword ptr [rsp - 72], ecx mov rax, qword ptr [rsp + 192] mov ecx, dword ptr [rsp - 96] add ecx, dword ptr [rax] mov dword ptr [rsp - 96], ecx mov rax, qword ptr [rsp + 184] mov ecx, dword ptr [rsp - 92] add ecx, dword ptr [rax] mov dword ptr [rsp - 92], ecx mov rax, qword ptr [rsp + 176] mov ecx, dword ptr [rsp - 52] add ecx, dword ptr [rax] mov dword ptr [rsp - 52], ecx mov rax, qword ptr [rsp + 168] mov ecx, dword ptr [rsp - 84] add ecx, dword ptr [rax] mov dword ptr [rsp - 84], ecx mov rax, qword ptr [rsp + 160] mov ecx, dword ptr [rsp - 80] add ecx, dword ptr [rax] mov dword ptr [rsp - 80], ecx mov rax, qword ptr [rsp + 152] mov ecx, dword ptr [rsp - 76] add ecx, dword ptr [rax] mov dword ptr [rsp - 76], ecx mov rax, qword ptr [rsp + 144] mov ecx, dword ptr [rsp - 44] add ecx, dword ptr [rax] mov dword ptr [rsp - 44], ecx mov rax, qword ptr [rsp + 136] mov ecx, dword ptr [rsp - 68] add ecx, dword ptr [rax] mov dword ptr [rsp - 68], ecx mov rax, qword ptr [rsp + 128] mov ecx, dword ptr [rsp - 64] add ecx, dword ptr [rax] mov dword ptr [rsp - 64], ecx mov rax, qword ptr [rsp + 120] mov ecx, dword ptr [rsp - 60] add ecx, dword ptr [rax] mov dword ptr [rsp - 60], 
ecx mov rax, qword ptr [rsp + 112] mov ecx, dword ptr [rsp - 56] add ecx, dword ptr [rax] mov dword ptr [rsp - 56], ecx mov rax, qword ptr [rsp + 104] mov ecx, dword ptr [rsp - 48] add ecx, dword ptr [rax] mov dword ptr [rsp - 48], ecx mov rdi, qword ptr [rsp - 8] mov r12, qword ptr [rsp + 224] cmp rdi, r12 mov qword ptr [rsp + 96], rdi jne .LBB12_3 jmp .LBB12_2 .LBB12_40: mov eax, dword ptr [rsp - 88] add eax, dword ptr [rsp - 32] mov ecx, dword ptr [rsp - 72] add ecx, dword ptr [rsp - 100] add ecx, eax mov edx, dword ptr [rsp - 92] add edx, dword ptr [rsp - 96] mov eax, dword ptr [rsp - 52] add eax, edx add eax, ecx mov ecx, dword ptr [rsp - 80] add ecx, dword ptr [rsp - 84] mov edx, dword ptr [rsp - 76] add edx, ecx mov ecx, dword ptr [rsp - 44] add ecx, edx add ecx, eax mov edx, dword ptr [rsp - 64] add edx, dword ptr [rsp - 68] mov eax, dword ptr [rsp - 60] add eax, edx mov edx, dword ptr [rsp - 56] add edx, eax mov eax, dword ptr [rsp - 48] add eax, edx add eax, ecx add rsp, 240 .cfi_def_cfa_offset 56 pop rbx .cfi_def_cfa_offset 48 pop r12 .cfi_def_cfa_offset 40 pop r13 .cfi_def_cfa_offset 32 pop r14 .cfi_def_cfa_offset 24 pop r15 .cfi_def_cfa_offset 16 pop rbp .cfi_def_cfa_offset 8 ret ``` </details> to ```assembly simd_sum_slow: .cfi_startproc xor eax, eax cmp rsi, 16 jb .LBB12_4 shl rsi, 2 pxor xmm0, xmm0 pxor xmm1, xmm1 pxor xmm3, xmm3 pxor xmm2, xmm2 .p2align 4 .LBB12_2: movdqu xmm4, xmmword ptr [rdi] paddd xmm0, xmm4 movdqu xmm4, xmmword ptr [rdi + 16] paddd xmm1, xmm4 movdqu xmm4, xmmword ptr [rdi + 32] paddd xmm3, xmm4 movdqu xmm4, xmmword ptr [rdi + 48] paddd xmm2, xmm4 add rdi, 64 add rsi, -64 cmp rsi, 60 ja .LBB12_2 paddd xmm0, xmm3 paddd xmm1, xmm2 paddd xmm1, xmm0 pshufd xmm0, xmm1, 238 paddd xmm0, xmm1 pshufd xmm1, xmm0, 85 paddd xmm1, xmm0 movd eax, xmm1 .LBB12_4: ret ```
…unk, r=Mark-Simulacrum optimize `slice::Iter::next_chunk` codegen for example (sourced from rust-lang#98326 (comment)) ```rust pub fn simd_sum_slow(arr: &[u32]) -> u32 { const STEP_SIZE: usize = 16; let mut result = [0; STEP_SIZE]; let mut iter = arr.iter(); while let Ok(c) = iter.next_chunk::<STEP_SIZE>() { for (&n, r) in c.iter().zip(result.iter_mut()) { *r += n; } } result.iter().sum() } ``` goes from (znver4) <details> <summary>many asm</summary> ```assembly simd_sum_slow: .cfi_startproc push rbp .cfi_def_cfa_offset 16 push r15 .cfi_def_cfa_offset 24 push r14 .cfi_def_cfa_offset 32 push r13 .cfi_def_cfa_offset 40 push r12 .cfi_def_cfa_offset 48 push rbx .cfi_def_cfa_offset 56 sub rsp, 240 .cfi_def_cfa_offset 296 .cfi_offset rbx, -56 .cfi_offset r12, -48 .cfi_offset r13, -40 .cfi_offset r14, -32 .cfi_offset r15, -24 .cfi_offset rbp, -16 lea r12, [rdi + 4*rsi] mov qword ptr [rsp - 32], 0 mov dword ptr [rsp - 88], 0 mov dword ptr [rsp - 100], 0 mov dword ptr [rsp - 72], 0 mov dword ptr [rsp - 96], 0 mov dword ptr [rsp - 92], 0 mov dword ptr [rsp - 52], 0 mov dword ptr [rsp - 84], 0 mov dword ptr [rsp - 80], 0 mov dword ptr [rsp - 76], 0 mov dword ptr [rsp - 44], 0 mov dword ptr [rsp - 68], 0 mov dword ptr [rsp - 64], 0 mov dword ptr [rsp - 60], 0 mov dword ptr [rsp - 56], 0 mov dword ptr [rsp - 48], 0 mov qword ptr [rsp + 224], r12 cmp rdi, r12 mov qword ptr [rsp + 96], rdi je .LBB12_2 .p2align 4 .LBB12_3: lea r13, [rdi + 4] cmp r13, r12 je .LBB12_4 lea r15, [rdi + 8] cmp r15, r12 je .LBB12_6 lea r14, [rdi + 12] cmp r14, r12 je .LBB12_8 lea rbx, [rdi + 16] cmp rbx, r12 je .LBB12_10 lea r11, [rdi + 20] cmp r11, r12 je .LBB12_12 lea r10, [rdi + 24] cmp r10, r12 je .LBB12_14 lea r9, [rdi + 28] cmp r9, r12 je .LBB12_16 lea r8, [rdi + 32] cmp r8, r12 je .LBB12_18 lea rax, [rdi + 36] cmp rax, r12 je .LBB12_20 mov qword ptr [rsp - 120], rax lea rax, [rdi + 40] mov qword ptr [rsp - 112], rax cmp rax, r12 je .LBB12_22 lea rdx, [rdi + 44] cmp rdx, r12 je .LBB12_24 lea rbp, 
[rdi + 48] cmp rbp, r12 je .LBB12_26 mov qword ptr [rsp - 40], r9 lea rcx, [rdi + 52] cmp rcx, r12 je .LBB12_30 lea r9, [rdi + 56] cmp r9, r12 je .LBB12_32 lea rax, [rdi + 60] cmp rax, r12 mov qword ptr [rsp - 24], r9 je .LBB12_34 mov qword ptr [rsp + 88], rax lea rax, [rdi + 64] mov qword ptr [rsp - 8], rax mov dword ptr [rsp - 128], 0 mov qword ptr [rsp - 16], rdi mov qword ptr [rsp + 56], r10 lea r10, [rsp + 216] mov qword ptr [rsp + 40], r10 mov qword ptr [rsp + 96], r13 lea rax, [rsp + 208] mov qword ptr [rsp + 32], rax mov qword ptr [rsp + 72], rdx lea rsi, [rsp + 200] mov qword ptr [rsp + 48], rbx lea rbx, [rsp + 192] mov qword ptr [rsp + 80], rcx lea rcx, [rsp + 184] mov qword ptr [rsp + 64], rbp lea rdx, [rsp + 176] mov qword ptr [rsp + 16], r14 lea r14, [rsp + 168] mov qword ptr [rsp + 24], r11 lea r11, [rsp + 160] lea r9, [rsp + 152] lea r12, [rsp + 144] lea r13, [rsp + 136] lea rbp, [rsp + 128] mov qword ptr [rsp], r15 lea r15, [rsp + 120] mov qword ptr [rsp + 8], r8 lea r8, [rsp + 112] lea rdi, [rsp + 104] jmp .LBB12_39 .p2align 4 .LBB12_2: mov qword ptr [rsp - 128], 0 jmp .LBB12_37 .p2align 4 .LBB12_4: mov eax, 1 mov qword ptr [rsp - 128], rax jmp .LBB12_37 .p2align 4 .LBB12_6: mov eax, 2 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_8: mov eax, 3 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_10: mov eax, 4 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_12: mov eax, 5 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_14: mov eax, 6 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_16: mov eax, 7 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_18: mov eax, 8 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_20: mov eax, 9 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_22: mov eax, 10 mov qword ptr [rsp - 128], rax mov rdi, qword ptr [rsp - 120] jmp .LBB12_36 .LBB12_24: mov eax, 11 mov qword ptr [rsp - 128], rax jmp .LBB12_27 .LBB12_26: mov eax, 12 mov qword ptr [rsp - 128], rax .LBB12_27: mov rsi, qword ptr [rsp - 112] 
mov rdi, qword ptr [rsp - 120] jmp .LBB12_36 .LBB12_30: mov rcx, rbp mov eax, 13 mov qword ptr [rsp - 128], rax jmp .LBB12_35 .LBB12_32: mov rax, rcx mov rcx, rbp mov esi, 14 mov qword ptr [rsp - 128], rsi jmp .LBB12_35 .LBB12_34: mov rax, rcx mov rcx, rbp mov esi, 15 mov qword ptr [rsp - 128], rsi .LBB12_35: mov rsi, qword ptr [rsp - 112] mov rdi, qword ptr [rsp - 120] mov r9, qword ptr [rsp - 40] .p2align 4 .LBB12_36: mov rbp, r13 .LBB12_37: mov r13, qword ptr [rsp - 24] mov qword ptr [rsp + 88], r13 mov r13, qword ptr [rsp - 128] mov qword ptr [rsp + 216], r13 mov r13b, 1 mov dword ptr [rsp - 128], r13d mov qword ptr [rsp - 24], rax mov qword ptr [rsp + 80], rcx mov qword ptr [rsp + 64], rdx mov qword ptr [rsp + 72], rsi mov qword ptr [rsp - 112], rdi mov qword ptr [rsp - 120], r8 mov qword ptr [rsp + 8], r9 mov qword ptr [rsp - 40], r10 mov qword ptr [rsp + 56], r11 mov qword ptr [rsp + 24], rbx mov qword ptr [rsp + 48], r14 mov qword ptr [rsp + 16], r15 mov qword ptr [rsp], rbp mov eax, 0 mov qword ptr [rsp - 16], rax mov qword ptr [rsp - 8], r12 lea r10, [rsp + 208] mov qword ptr [rsp + 40], r10 lea rax, [rsp + 200] mov qword ptr [rsp + 32], rax lea rsi, [rsp + 192] lea rbx, [rsp + 184] lea rcx, [rsp + 176] lea rdx, [rsp + 168] lea r14, [rsp + 160] lea r11, [rsp + 152] lea r9, [rsp + 144] lea r12, [rsp + 136] lea r13, [rsp + 128] lea rbp, [rsp + 120] lea r15, [rsp + 112] lea r8, [rsp + 104] lea rdi, [rsp + 232] .LBB12_39: mov r10, qword ptr [rsp + 96] mov rax, qword ptr [rsp + 40] mov qword ptr [rax], r10 mov rax, qword ptr [rsp + 32] mov r10, qword ptr [rsp] mov qword ptr [rax], r10 mov rax, qword ptr [rsp + 16] mov qword ptr [rsi], rax mov rax, qword ptr [rsp + 48] mov qword ptr [rbx], rax mov rax, qword ptr [rsp + 24] mov qword ptr [rcx], rax mov rax, qword ptr [rsp + 56] mov qword ptr [rdx], rax mov rax, qword ptr [rsp - 40] mov qword ptr [r14], rax mov rax, qword ptr [rsp + 8] mov qword ptr [r11], rax mov rax, qword ptr [rsp - 120] mov qword ptr [r9], 
rax mov rax, qword ptr [rsp - 112] mov qword ptr [r12], rax mov rax, qword ptr [rsp + 72] mov qword ptr [r13], rax mov rax, qword ptr [rsp + 64] mov qword ptr [rbp], rax mov rax, qword ptr [rsp + 80] mov qword ptr [r15], rax mov rax, qword ptr [rsp - 24] mov qword ptr [r8], rax mov rax, qword ptr [rsp + 88] mov qword ptr [rdi], rax cmp byte ptr [rsp - 128], 0 jne .LBB12_40 mov rax, qword ptr [rsp - 32] mov rcx, qword ptr [rsp - 16] add eax, dword ptr [rcx] mov qword ptr [rsp - 32], rax mov rax, qword ptr [rsp + 216] mov ecx, dword ptr [rsp - 88] add ecx, dword ptr [rax] mov dword ptr [rsp - 88], ecx mov rax, qword ptr [rsp + 208] mov ecx, dword ptr [rsp - 100] add ecx, dword ptr [rax] mov dword ptr [rsp - 100], ecx mov rax, qword ptr [rsp + 200] mov ecx, dword ptr [rsp - 72] add ecx, dword ptr [rax] mov dword ptr [rsp - 72], ecx mov rax, qword ptr [rsp + 192] mov ecx, dword ptr [rsp - 96] add ecx, dword ptr [rax] mov dword ptr [rsp - 96], ecx mov rax, qword ptr [rsp + 184] mov ecx, dword ptr [rsp - 92] add ecx, dword ptr [rax] mov dword ptr [rsp - 92], ecx mov rax, qword ptr [rsp + 176] mov ecx, dword ptr [rsp - 52] add ecx, dword ptr [rax] mov dword ptr [rsp - 52], ecx mov rax, qword ptr [rsp + 168] mov ecx, dword ptr [rsp - 84] add ecx, dword ptr [rax] mov dword ptr [rsp - 84], ecx mov rax, qword ptr [rsp + 160] mov ecx, dword ptr [rsp - 80] add ecx, dword ptr [rax] mov dword ptr [rsp - 80], ecx mov rax, qword ptr [rsp + 152] mov ecx, dword ptr [rsp - 76] add ecx, dword ptr [rax] mov dword ptr [rsp - 76], ecx mov rax, qword ptr [rsp + 144] mov ecx, dword ptr [rsp - 44] add ecx, dword ptr [rax] mov dword ptr [rsp - 44], ecx mov rax, qword ptr [rsp + 136] mov ecx, dword ptr [rsp - 68] add ecx, dword ptr [rax] mov dword ptr [rsp - 68], ecx mov rax, qword ptr [rsp + 128] mov ecx, dword ptr [rsp - 64] add ecx, dword ptr [rax] mov dword ptr [rsp - 64], ecx mov rax, qword ptr [rsp + 120] mov ecx, dword ptr [rsp - 60] add ecx, dword ptr [rax] mov dword ptr [rsp - 60], 
ecx mov rax, qword ptr [rsp + 112] mov ecx, dword ptr [rsp - 56] add ecx, dword ptr [rax] mov dword ptr [rsp - 56], ecx mov rax, qword ptr [rsp + 104] mov ecx, dword ptr [rsp - 48] add ecx, dword ptr [rax] mov dword ptr [rsp - 48], ecx mov rdi, qword ptr [rsp - 8] mov r12, qword ptr [rsp + 224] cmp rdi, r12 mov qword ptr [rsp + 96], rdi jne .LBB12_3 jmp .LBB12_2 .LBB12_40: mov eax, dword ptr [rsp - 88] add eax, dword ptr [rsp - 32] mov ecx, dword ptr [rsp - 72] add ecx, dword ptr [rsp - 100] add ecx, eax mov edx, dword ptr [rsp - 92] add edx, dword ptr [rsp - 96] mov eax, dword ptr [rsp - 52] add eax, edx add eax, ecx mov ecx, dword ptr [rsp - 80] add ecx, dword ptr [rsp - 84] mov edx, dword ptr [rsp - 76] add edx, ecx mov ecx, dword ptr [rsp - 44] add ecx, edx add ecx, eax mov edx, dword ptr [rsp - 64] add edx, dword ptr [rsp - 68] mov eax, dword ptr [rsp - 60] add eax, edx mov edx, dword ptr [rsp - 56] add edx, eax mov eax, dword ptr [rsp - 48] add eax, edx add eax, ecx add rsp, 240 .cfi_def_cfa_offset 56 pop rbx .cfi_def_cfa_offset 48 pop r12 .cfi_def_cfa_offset 40 pop r13 .cfi_def_cfa_offset 32 pop r14 .cfi_def_cfa_offset 24 pop r15 .cfi_def_cfa_offset 16 pop rbp .cfi_def_cfa_offset 8 ret ``` </details> to ```assembly simd_sum_slow: .cfi_startproc xor eax, eax cmp rsi, 16 jb .LBB12_4 shl rsi, 2 pxor xmm0, xmm0 pxor xmm1, xmm1 pxor xmm3, xmm3 pxor xmm2, xmm2 .p2align 4 .LBB12_2: movdqu xmm4, xmmword ptr [rdi] paddd xmm0, xmm4 movdqu xmm4, xmmword ptr [rdi + 16] paddd xmm1, xmm4 movdqu xmm4, xmmword ptr [rdi + 32] paddd xmm3, xmm4 movdqu xmm4, xmmword ptr [rdi + 48] paddd xmm2, xmm4 add rdi, 64 add rsi, -64 cmp rsi, 60 ja .LBB12_2 paddd xmm0, xmm3 paddd xmm1, xmm2 paddd xmm1, xmm0 pshufd xmm0, xmm1, 238 paddd xmm0, xmm1 pshufd xmm1, xmm0, 85 paddd xmm1, xmm0 movd eax, xmm1 .LBB12_4: ret ```
…unk, r=Mark-Simulacrum optimize `slice::Iter::next_chunk` codegen for example (sourced from rust-lang#98326 (comment)) ```rust pub fn simd_sum_slow(arr: &[u32]) -> u32 { const STEP_SIZE: usize = 16; let mut result = [0; STEP_SIZE]; let mut iter = arr.iter(); while let Ok(c) = iter.next_chunk::<STEP_SIZE>() { for (&n, r) in c.iter().zip(result.iter_mut()) { *r += n; } } result.iter().sum() } ``` goes from (znver4) <details> <summary>many asm</summary> ```assembly simd_sum_slow: .cfi_startproc push rbp .cfi_def_cfa_offset 16 push r15 .cfi_def_cfa_offset 24 push r14 .cfi_def_cfa_offset 32 push r13 .cfi_def_cfa_offset 40 push r12 .cfi_def_cfa_offset 48 push rbx .cfi_def_cfa_offset 56 sub rsp, 240 .cfi_def_cfa_offset 296 .cfi_offset rbx, -56 .cfi_offset r12, -48 .cfi_offset r13, -40 .cfi_offset r14, -32 .cfi_offset r15, -24 .cfi_offset rbp, -16 lea r12, [rdi + 4*rsi] mov qword ptr [rsp - 32], 0 mov dword ptr [rsp - 88], 0 mov dword ptr [rsp - 100], 0 mov dword ptr [rsp - 72], 0 mov dword ptr [rsp - 96], 0 mov dword ptr [rsp - 92], 0 mov dword ptr [rsp - 52], 0 mov dword ptr [rsp - 84], 0 mov dword ptr [rsp - 80], 0 mov dword ptr [rsp - 76], 0 mov dword ptr [rsp - 44], 0 mov dword ptr [rsp - 68], 0 mov dword ptr [rsp - 64], 0 mov dword ptr [rsp - 60], 0 mov dword ptr [rsp - 56], 0 mov dword ptr [rsp - 48], 0 mov qword ptr [rsp + 224], r12 cmp rdi, r12 mov qword ptr [rsp + 96], rdi je .LBB12_2 .p2align 4 .LBB12_3: lea r13, [rdi + 4] cmp r13, r12 je .LBB12_4 lea r15, [rdi + 8] cmp r15, r12 je .LBB12_6 lea r14, [rdi + 12] cmp r14, r12 je .LBB12_8 lea rbx, [rdi + 16] cmp rbx, r12 je .LBB12_10 lea r11, [rdi + 20] cmp r11, r12 je .LBB12_12 lea r10, [rdi + 24] cmp r10, r12 je .LBB12_14 lea r9, [rdi + 28] cmp r9, r12 je .LBB12_16 lea r8, [rdi + 32] cmp r8, r12 je .LBB12_18 lea rax, [rdi + 36] cmp rax, r12 je .LBB12_20 mov qword ptr [rsp - 120], rax lea rax, [rdi + 40] mov qword ptr [rsp - 112], rax cmp rax, r12 je .LBB12_22 lea rdx, [rdi + 44] cmp rdx, r12 je .LBB12_24 lea rbp, 
[rdi + 48] cmp rbp, r12 je .LBB12_26 mov qword ptr [rsp - 40], r9 lea rcx, [rdi + 52] cmp rcx, r12 je .LBB12_30 lea r9, [rdi + 56] cmp r9, r12 je .LBB12_32 lea rax, [rdi + 60] cmp rax, r12 mov qword ptr [rsp - 24], r9 je .LBB12_34 mov qword ptr [rsp + 88], rax lea rax, [rdi + 64] mov qword ptr [rsp - 8], rax mov dword ptr [rsp - 128], 0 mov qword ptr [rsp - 16], rdi mov qword ptr [rsp + 56], r10 lea r10, [rsp + 216] mov qword ptr [rsp + 40], r10 mov qword ptr [rsp + 96], r13 lea rax, [rsp + 208] mov qword ptr [rsp + 32], rax mov qword ptr [rsp + 72], rdx lea rsi, [rsp + 200] mov qword ptr [rsp + 48], rbx lea rbx, [rsp + 192] mov qword ptr [rsp + 80], rcx lea rcx, [rsp + 184] mov qword ptr [rsp + 64], rbp lea rdx, [rsp + 176] mov qword ptr [rsp + 16], r14 lea r14, [rsp + 168] mov qword ptr [rsp + 24], r11 lea r11, [rsp + 160] lea r9, [rsp + 152] lea r12, [rsp + 144] lea r13, [rsp + 136] lea rbp, [rsp + 128] mov qword ptr [rsp], r15 lea r15, [rsp + 120] mov qword ptr [rsp + 8], r8 lea r8, [rsp + 112] lea rdi, [rsp + 104] jmp .LBB12_39 .p2align 4 .LBB12_2: mov qword ptr [rsp - 128], 0 jmp .LBB12_37 .p2align 4 .LBB12_4: mov eax, 1 mov qword ptr [rsp - 128], rax jmp .LBB12_37 .p2align 4 .LBB12_6: mov eax, 2 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_8: mov eax, 3 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_10: mov eax, 4 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_12: mov eax, 5 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_14: mov eax, 6 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_16: mov eax, 7 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_18: mov eax, 8 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_20: mov eax, 9 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_22: mov eax, 10 mov qword ptr [rsp - 128], rax mov rdi, qword ptr [rsp - 120] jmp .LBB12_36 .LBB12_24: mov eax, 11 mov qword ptr [rsp - 128], rax jmp .LBB12_27 .LBB12_26: mov eax, 12 mov qword ptr [rsp - 128], rax .LBB12_27: mov rsi, qword ptr [rsp - 112] 
mov rdi, qword ptr [rsp - 120] jmp .LBB12_36 .LBB12_30: mov rcx, rbp mov eax, 13 mov qword ptr [rsp - 128], rax jmp .LBB12_35 .LBB12_32: mov rax, rcx mov rcx, rbp mov esi, 14 mov qword ptr [rsp - 128], rsi jmp .LBB12_35 .LBB12_34: mov rax, rcx mov rcx, rbp mov esi, 15 mov qword ptr [rsp - 128], rsi .LBB12_35: mov rsi, qword ptr [rsp - 112] mov rdi, qword ptr [rsp - 120] mov r9, qword ptr [rsp - 40] .p2align 4 .LBB12_36: mov rbp, r13 .LBB12_37: mov r13, qword ptr [rsp - 24] mov qword ptr [rsp + 88], r13 mov r13, qword ptr [rsp - 128] mov qword ptr [rsp + 216], r13 mov r13b, 1 mov dword ptr [rsp - 128], r13d mov qword ptr [rsp - 24], rax mov qword ptr [rsp + 80], rcx mov qword ptr [rsp + 64], rdx mov qword ptr [rsp + 72], rsi mov qword ptr [rsp - 112], rdi mov qword ptr [rsp - 120], r8 mov qword ptr [rsp + 8], r9 mov qword ptr [rsp - 40], r10 mov qword ptr [rsp + 56], r11 mov qword ptr [rsp + 24], rbx mov qword ptr [rsp + 48], r14 mov qword ptr [rsp + 16], r15 mov qword ptr [rsp], rbp mov eax, 0 mov qword ptr [rsp - 16], rax mov qword ptr [rsp - 8], r12 lea r10, [rsp + 208] mov qword ptr [rsp + 40], r10 lea rax, [rsp + 200] mov qword ptr [rsp + 32], rax lea rsi, [rsp + 192] lea rbx, [rsp + 184] lea rcx, [rsp + 176] lea rdx, [rsp + 168] lea r14, [rsp + 160] lea r11, [rsp + 152] lea r9, [rsp + 144] lea r12, [rsp + 136] lea r13, [rsp + 128] lea rbp, [rsp + 120] lea r15, [rsp + 112] lea r8, [rsp + 104] lea rdi, [rsp + 232] .LBB12_39: mov r10, qword ptr [rsp + 96] mov rax, qword ptr [rsp + 40] mov qword ptr [rax], r10 mov rax, qword ptr [rsp + 32] mov r10, qword ptr [rsp] mov qword ptr [rax], r10 mov rax, qword ptr [rsp + 16] mov qword ptr [rsi], rax mov rax, qword ptr [rsp + 48] mov qword ptr [rbx], rax mov rax, qword ptr [rsp + 24] mov qword ptr [rcx], rax mov rax, qword ptr [rsp + 56] mov qword ptr [rdx], rax mov rax, qword ptr [rsp - 40] mov qword ptr [r14], rax mov rax, qword ptr [rsp + 8] mov qword ptr [r11], rax mov rax, qword ptr [rsp - 120] mov qword ptr [r9], 
rax mov rax, qword ptr [rsp - 112] mov qword ptr [r12], rax mov rax, qword ptr [rsp + 72] mov qword ptr [r13], rax mov rax, qword ptr [rsp + 64] mov qword ptr [rbp], rax mov rax, qword ptr [rsp + 80] mov qword ptr [r15], rax mov rax, qword ptr [rsp - 24] mov qword ptr [r8], rax mov rax, qword ptr [rsp + 88] mov qword ptr [rdi], rax cmp byte ptr [rsp - 128], 0 jne .LBB12_40 mov rax, qword ptr [rsp - 32] mov rcx, qword ptr [rsp - 16] add eax, dword ptr [rcx] mov qword ptr [rsp - 32], rax mov rax, qword ptr [rsp + 216] mov ecx, dword ptr [rsp - 88] add ecx, dword ptr [rax] mov dword ptr [rsp - 88], ecx mov rax, qword ptr [rsp + 208] mov ecx, dword ptr [rsp - 100] add ecx, dword ptr [rax] mov dword ptr [rsp - 100], ecx mov rax, qword ptr [rsp + 200] mov ecx, dword ptr [rsp - 72] add ecx, dword ptr [rax] mov dword ptr [rsp - 72], ecx mov rax, qword ptr [rsp + 192] mov ecx, dword ptr [rsp - 96] add ecx, dword ptr [rax] mov dword ptr [rsp - 96], ecx mov rax, qword ptr [rsp + 184] mov ecx, dword ptr [rsp - 92] add ecx, dword ptr [rax] mov dword ptr [rsp - 92], ecx mov rax, qword ptr [rsp + 176] mov ecx, dword ptr [rsp - 52] add ecx, dword ptr [rax] mov dword ptr [rsp - 52], ecx mov rax, qword ptr [rsp + 168] mov ecx, dword ptr [rsp - 84] add ecx, dword ptr [rax] mov dword ptr [rsp - 84], ecx mov rax, qword ptr [rsp + 160] mov ecx, dword ptr [rsp - 80] add ecx, dword ptr [rax] mov dword ptr [rsp - 80], ecx mov rax, qword ptr [rsp + 152] mov ecx, dword ptr [rsp - 76] add ecx, dword ptr [rax] mov dword ptr [rsp - 76], ecx mov rax, qword ptr [rsp + 144] mov ecx, dword ptr [rsp - 44] add ecx, dword ptr [rax] mov dword ptr [rsp - 44], ecx mov rax, qword ptr [rsp + 136] mov ecx, dword ptr [rsp - 68] add ecx, dword ptr [rax] mov dword ptr [rsp - 68], ecx mov rax, qword ptr [rsp + 128] mov ecx, dword ptr [rsp - 64] add ecx, dword ptr [rax] mov dword ptr [rsp - 64], ecx mov rax, qword ptr [rsp + 120] mov ecx, dword ptr [rsp - 60] add ecx, dword ptr [rax] mov dword ptr [rsp - 60], 
ecx mov rax, qword ptr [rsp + 112] mov ecx, dword ptr [rsp - 56] add ecx, dword ptr [rax] mov dword ptr [rsp - 56], ecx mov rax, qword ptr [rsp + 104] mov ecx, dword ptr [rsp - 48] add ecx, dword ptr [rax] mov dword ptr [rsp - 48], ecx mov rdi, qword ptr [rsp - 8] mov r12, qword ptr [rsp + 224] cmp rdi, r12 mov qword ptr [rsp + 96], rdi jne .LBB12_3 jmp .LBB12_2 .LBB12_40: mov eax, dword ptr [rsp - 88] add eax, dword ptr [rsp - 32] mov ecx, dword ptr [rsp - 72] add ecx, dword ptr [rsp - 100] add ecx, eax mov edx, dword ptr [rsp - 92] add edx, dword ptr [rsp - 96] mov eax, dword ptr [rsp - 52] add eax, edx add eax, ecx mov ecx, dword ptr [rsp - 80] add ecx, dword ptr [rsp - 84] mov edx, dword ptr [rsp - 76] add edx, ecx mov ecx, dword ptr [rsp - 44] add ecx, edx add ecx, eax mov edx, dword ptr [rsp - 64] add edx, dword ptr [rsp - 68] mov eax, dword ptr [rsp - 60] add eax, edx mov edx, dword ptr [rsp - 56] add edx, eax mov eax, dword ptr [rsp - 48] add eax, edx add eax, ecx add rsp, 240 .cfi_def_cfa_offset 56 pop rbx .cfi_def_cfa_offset 48 pop r12 .cfi_def_cfa_offset 40 pop r13 .cfi_def_cfa_offset 32 pop r14 .cfi_def_cfa_offset 24 pop r15 .cfi_def_cfa_offset 16 pop rbp .cfi_def_cfa_offset 8 ret ``` </details> to ```assembly simd_sum_slow: .cfi_startproc xor eax, eax cmp rsi, 16 jb .LBB12_4 shl rsi, 2 pxor xmm0, xmm0 pxor xmm1, xmm1 pxor xmm3, xmm3 pxor xmm2, xmm2 .p2align 4 .LBB12_2: movdqu xmm4, xmmword ptr [rdi] paddd xmm0, xmm4 movdqu xmm4, xmmword ptr [rdi + 16] paddd xmm1, xmm4 movdqu xmm4, xmmword ptr [rdi + 32] paddd xmm3, xmm4 movdqu xmm4, xmmword ptr [rdi + 48] paddd xmm2, xmm4 add rdi, 64 add rsi, -64 cmp rsi, 60 ja .LBB12_2 paddd xmm0, xmm3 paddd xmm1, xmm2 paddd xmm1, xmm0 pshufd xmm0, xmm1, 238 paddd xmm0, xmm1 pshufd xmm1, xmm0, 85 paddd xmm1, xmm0 movd eax, xmm1 .LBB12_4: ret ```
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Rollup of 19 pull requests Successful merges: - #148048 (Stabilize `maybe_uninit_write_slice`) - #148641 (Add a diagnostic attribute for special casing const bound errors for non-const impls) - #148765 (std: split up the `thread` module) - #149074 (Add Command::get_env_clear) - #149097 (num: Implement `uint_gather_scatter_bits` feature for unsigned integers) - #149131 (optimize `slice::Iter::next_chunk`) - #149190 (Forbid `CHECK: br` and `CHECK-NOT: br` in codegen tests (suggest `br {{.*}}` instead)) - #149239 (clarify float min/max behaviors for NaNs and signed zeros) - #149243 (Fix typo and clarify bootstrap change tracker entry) - #149270 (implement `Iterator::{exactly_one, collect_array}`) - #149295 (Suggest _bytes versions of endian-converting methods) - #149301 (Motor OS: make decode_error_kind more comprehensive) - #149306 (bootstrap: Miri now handles jemalloc like everything else) - #149325 (rustdoc: add regression test for #140968) - #149332 (fix rustdoc search says “Consider searching for "null" instead.” #149324) - #149349 (Fix typo in comment.) - #149353 (Tidying up UI tests [3/N]) - #149355 (Document that `build.description` affects symbol mangling and crate IDs) - #149360 (Enable CI download for windows-gnullvm) r? `@ghost` `@rustbot` modify labels: rollup
Rollup of 12 pull requests Successful merges: - #147115 (More robust stack protector testing) - #148048 (Stabilize `maybe_uninit_write_slice`) - #148641 (Add a diagnostic attribute for special casing const bound errors for non-const impls) - #149074 (Add Command::get_env_clear) - #149097 (num: Implement `uint_gather_scatter_bits` feature for unsigned integers) - #149131 (optimize `slice::Iter::next_chunk`) - #149190 (Forbid `CHECK: br` and `CHECK-NOT: br` in codegen tests (suggest `br {{.*}}` instead)) - #149239 (clarify float min/max behaviors for NaNs and signed zeros) - #149243 (Fix typo and clarify bootstrap change tracker entry) - #149301 (Motor OS: make decode_error_kind more comprehensive) - #149306 (bootstrap: Miri now handles jemalloc like everything else) - #149325 (rustdoc: add regression test for #140968) r? `@ghost` `@rustbot` modify labels: rollup
Rollup merge of #149131 - bend-n:optimize_slice_iter_next_chunk, r=Mark-Simulacrum optimize `slice::Iter::next_chunk` codegen for example (sourced from #98326 (comment)) ```rust pub fn simd_sum_slow(arr: &[u32]) -> u32 { const STEP_SIZE: usize = 16; let mut result = [0; STEP_SIZE]; let mut iter = arr.iter(); while let Ok(c) = iter.next_chunk::<STEP_SIZE>() { for (&n, r) in c.iter().zip(result.iter_mut()) { *r += n; } } result.iter().sum() } ``` goes from (znver4) <details> <summary>many asm</summary> ```assembly simd_sum_slow: .cfi_startproc push rbp .cfi_def_cfa_offset 16 push r15 .cfi_def_cfa_offset 24 push r14 .cfi_def_cfa_offset 32 push r13 .cfi_def_cfa_offset 40 push r12 .cfi_def_cfa_offset 48 push rbx .cfi_def_cfa_offset 56 sub rsp, 240 .cfi_def_cfa_offset 296 .cfi_offset rbx, -56 .cfi_offset r12, -48 .cfi_offset r13, -40 .cfi_offset r14, -32 .cfi_offset r15, -24 .cfi_offset rbp, -16 lea r12, [rdi + 4*rsi] mov qword ptr [rsp - 32], 0 mov dword ptr [rsp - 88], 0 mov dword ptr [rsp - 100], 0 mov dword ptr [rsp - 72], 0 mov dword ptr [rsp - 96], 0 mov dword ptr [rsp - 92], 0 mov dword ptr [rsp - 52], 0 mov dword ptr [rsp - 84], 0 mov dword ptr [rsp - 80], 0 mov dword ptr [rsp - 76], 0 mov dword ptr [rsp - 44], 0 mov dword ptr [rsp - 68], 0 mov dword ptr [rsp - 64], 0 mov dword ptr [rsp - 60], 0 mov dword ptr [rsp - 56], 0 mov dword ptr [rsp - 48], 0 mov qword ptr [rsp + 224], r12 cmp rdi, r12 mov qword ptr [rsp + 96], rdi je .LBB12_2 .p2align 4 .LBB12_3: lea r13, [rdi + 4] cmp r13, r12 je .LBB12_4 lea r15, [rdi + 8] cmp r15, r12 je .LBB12_6 lea r14, [rdi + 12] cmp r14, r12 je .LBB12_8 lea rbx, [rdi + 16] cmp rbx, r12 je .LBB12_10 lea r11, [rdi + 20] cmp r11, r12 je .LBB12_12 lea r10, [rdi + 24] cmp r10, r12 je .LBB12_14 lea r9, [rdi + 28] cmp r9, r12 je .LBB12_16 lea r8, [rdi + 32] cmp r8, r12 je .LBB12_18 lea rax, [rdi + 36] cmp rax, r12 je .LBB12_20 mov qword ptr [rsp - 120], rax lea rax, [rdi + 40] mov qword ptr [rsp - 112], rax cmp rax, r12 je .LBB12_22 lea 
rdx, [rdi + 44] cmp rdx, r12 je .LBB12_24 lea rbp, [rdi + 48] cmp rbp, r12 je .LBB12_26 mov qword ptr [rsp - 40], r9 lea rcx, [rdi + 52] cmp rcx, r12 je .LBB12_30 lea r9, [rdi + 56] cmp r9, r12 je .LBB12_32 lea rax, [rdi + 60] cmp rax, r12 mov qword ptr [rsp - 24], r9 je .LBB12_34 mov qword ptr [rsp + 88], rax lea rax, [rdi + 64] mov qword ptr [rsp - 8], rax mov dword ptr [rsp - 128], 0 mov qword ptr [rsp - 16], rdi mov qword ptr [rsp + 56], r10 lea r10, [rsp + 216] mov qword ptr [rsp + 40], r10 mov qword ptr [rsp + 96], r13 lea rax, [rsp + 208] mov qword ptr [rsp + 32], rax mov qword ptr [rsp + 72], rdx lea rsi, [rsp + 200] mov qword ptr [rsp + 48], rbx lea rbx, [rsp + 192] mov qword ptr [rsp + 80], rcx lea rcx, [rsp + 184] mov qword ptr [rsp + 64], rbp lea rdx, [rsp + 176] mov qword ptr [rsp + 16], r14 lea r14, [rsp + 168] mov qword ptr [rsp + 24], r11 lea r11, [rsp + 160] lea r9, [rsp + 152] lea r12, [rsp + 144] lea r13, [rsp + 136] lea rbp, [rsp + 128] mov qword ptr [rsp], r15 lea r15, [rsp + 120] mov qword ptr [rsp + 8], r8 lea r8, [rsp + 112] lea rdi, [rsp + 104] jmp .LBB12_39 .p2align 4 .LBB12_2: mov qword ptr [rsp - 128], 0 jmp .LBB12_37 .p2align 4 .LBB12_4: mov eax, 1 mov qword ptr [rsp - 128], rax jmp .LBB12_37 .p2align 4 .LBB12_6: mov eax, 2 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_8: mov eax, 3 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_10: mov eax, 4 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_12: mov eax, 5 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_14: mov eax, 6 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_16: mov eax, 7 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_18: mov eax, 8 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_20: mov eax, 9 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_22: mov eax, 10 mov qword ptr [rsp - 128], rax mov rdi, qword ptr [rsp - 120] jmp .LBB12_36 .LBB12_24: mov eax, 11 mov qword ptr [rsp - 128], rax jmp .LBB12_27 .LBB12_26: mov eax, 12 mov qword ptr [rsp - 
128], rax .LBB12_27: mov rsi, qword ptr [rsp - 112] mov rdi, qword ptr [rsp - 120] jmp .LBB12_36 .LBB12_30: mov rcx, rbp mov eax, 13 mov qword ptr [rsp - 128], rax jmp .LBB12_35 .LBB12_32: mov rax, rcx mov rcx, rbp mov esi, 14 mov qword ptr [rsp - 128], rsi jmp .LBB12_35 .LBB12_34: mov rax, rcx mov rcx, rbp mov esi, 15 mov qword ptr [rsp - 128], rsi .LBB12_35: mov rsi, qword ptr [rsp - 112] mov rdi, qword ptr [rsp - 120] mov r9, qword ptr [rsp - 40] .p2align 4 .LBB12_36: mov rbp, r13 .LBB12_37: mov r13, qword ptr [rsp - 24] mov qword ptr [rsp + 88], r13 mov r13, qword ptr [rsp - 128] mov qword ptr [rsp + 216], r13 mov r13b, 1 mov dword ptr [rsp - 128], r13d mov qword ptr [rsp - 24], rax mov qword ptr [rsp + 80], rcx mov qword ptr [rsp + 64], rdx mov qword ptr [rsp + 72], rsi mov qword ptr [rsp - 112], rdi mov qword ptr [rsp - 120], r8 mov qword ptr [rsp + 8], r9 mov qword ptr [rsp - 40], r10 mov qword ptr [rsp + 56], r11 mov qword ptr [rsp + 24], rbx mov qword ptr [rsp + 48], r14 mov qword ptr [rsp + 16], r15 mov qword ptr [rsp], rbp mov eax, 0 mov qword ptr [rsp - 16], rax mov qword ptr [rsp - 8], r12 lea r10, [rsp + 208] mov qword ptr [rsp + 40], r10 lea rax, [rsp + 200] mov qword ptr [rsp + 32], rax lea rsi, [rsp + 192] lea rbx, [rsp + 184] lea rcx, [rsp + 176] lea rdx, [rsp + 168] lea r14, [rsp + 160] lea r11, [rsp + 152] lea r9, [rsp + 144] lea r12, [rsp + 136] lea r13, [rsp + 128] lea rbp, [rsp + 120] lea r15, [rsp + 112] lea r8, [rsp + 104] lea rdi, [rsp + 232] .LBB12_39: mov r10, qword ptr [rsp + 96] mov rax, qword ptr [rsp + 40] mov qword ptr [rax], r10 mov rax, qword ptr [rsp + 32] mov r10, qword ptr [rsp] mov qword ptr [rax], r10 mov rax, qword ptr [rsp + 16] mov qword ptr [rsi], rax mov rax, qword ptr [rsp + 48] mov qword ptr [rbx], rax mov rax, qword ptr [rsp + 24] mov qword ptr [rcx], rax mov rax, qword ptr [rsp + 56] mov qword ptr [rdx], rax mov rax, qword ptr [rsp - 40] mov qword ptr [r14], rax mov rax, qword ptr [rsp + 8] mov qword ptr [r11], rax 
mov rax, qword ptr [rsp - 120] mov qword ptr [r9], rax mov rax, qword ptr [rsp - 112] mov qword ptr [r12], rax mov rax, qword ptr [rsp + 72] mov qword ptr [r13], rax mov rax, qword ptr [rsp + 64] mov qword ptr [rbp], rax mov rax, qword ptr [rsp + 80] mov qword ptr [r15], rax mov rax, qword ptr [rsp - 24] mov qword ptr [r8], rax mov rax, qword ptr [rsp + 88] mov qword ptr [rdi], rax cmp byte ptr [rsp - 128], 0 jne .LBB12_40 mov rax, qword ptr [rsp - 32] mov rcx, qword ptr [rsp - 16] add eax, dword ptr [rcx] mov qword ptr [rsp - 32], rax mov rax, qword ptr [rsp + 216] mov ecx, dword ptr [rsp - 88] add ecx, dword ptr [rax] mov dword ptr [rsp - 88], ecx mov rax, qword ptr [rsp + 208] mov ecx, dword ptr [rsp - 100] add ecx, dword ptr [rax] mov dword ptr [rsp - 100], ecx mov rax, qword ptr [rsp + 200] mov ecx, dword ptr [rsp - 72] add ecx, dword ptr [rax] mov dword ptr [rsp - 72], ecx mov rax, qword ptr [rsp + 192] mov ecx, dword ptr [rsp - 96] add ecx, dword ptr [rax] mov dword ptr [rsp - 96], ecx mov rax, qword ptr [rsp + 184] mov ecx, dword ptr [rsp - 92] add ecx, dword ptr [rax] mov dword ptr [rsp - 92], ecx mov rax, qword ptr [rsp + 176] mov ecx, dword ptr [rsp - 52] add ecx, dword ptr [rax] mov dword ptr [rsp - 52], ecx mov rax, qword ptr [rsp + 168] mov ecx, dword ptr [rsp - 84] add ecx, dword ptr [rax] mov dword ptr [rsp - 84], ecx mov rax, qword ptr [rsp + 160] mov ecx, dword ptr [rsp - 80] add ecx, dword ptr [rax] mov dword ptr [rsp - 80], ecx mov rax, qword ptr [rsp + 152] mov ecx, dword ptr [rsp - 76] add ecx, dword ptr [rax] mov dword ptr [rsp - 76], ecx mov rax, qword ptr [rsp + 144] mov ecx, dword ptr [rsp - 44] add ecx, dword ptr [rax] mov dword ptr [rsp - 44], ecx mov rax, qword ptr [rsp + 136] mov ecx, dword ptr [rsp - 68] add ecx, dword ptr [rax] mov dword ptr [rsp - 68], ecx mov rax, qword ptr [rsp + 128] mov ecx, dword ptr [rsp - 64] add ecx, dword ptr [rax] mov dword ptr [rsp - 64], ecx mov rax, qword ptr [rsp + 120] mov ecx, dword ptr [rsp - 60] 
add ecx, dword ptr [rax] mov dword ptr [rsp - 60], ecx mov rax, qword ptr [rsp + 112] mov ecx, dword ptr [rsp - 56] add ecx, dword ptr [rax] mov dword ptr [rsp - 56], ecx mov rax, qword ptr [rsp + 104] mov ecx, dword ptr [rsp - 48] add ecx, dword ptr [rax] mov dword ptr [rsp - 48], ecx mov rdi, qword ptr [rsp - 8] mov r12, qword ptr [rsp + 224] cmp rdi, r12 mov qword ptr [rsp + 96], rdi jne .LBB12_3 jmp .LBB12_2 .LBB12_40: mov eax, dword ptr [rsp - 88] add eax, dword ptr [rsp - 32] mov ecx, dword ptr [rsp - 72] add ecx, dword ptr [rsp - 100] add ecx, eax mov edx, dword ptr [rsp - 92] add edx, dword ptr [rsp - 96] mov eax, dword ptr [rsp - 52] add eax, edx add eax, ecx mov ecx, dword ptr [rsp - 80] add ecx, dword ptr [rsp - 84] mov edx, dword ptr [rsp - 76] add edx, ecx mov ecx, dword ptr [rsp - 44] add ecx, edx add ecx, eax mov edx, dword ptr [rsp - 64] add edx, dword ptr [rsp - 68] mov eax, dword ptr [rsp - 60] add eax, edx mov edx, dword ptr [rsp - 56] add edx, eax mov eax, dword ptr [rsp - 48] add eax, edx add eax, ecx add rsp, 240 .cfi_def_cfa_offset 56 pop rbx .cfi_def_cfa_offset 48 pop r12 .cfi_def_cfa_offset 40 pop r13 .cfi_def_cfa_offset 32 pop r14 .cfi_def_cfa_offset 24 pop r15 .cfi_def_cfa_offset 16 pop rbp .cfi_def_cfa_offset 8 ret ``` </details> to ```assembly simd_sum_slow: .cfi_startproc xor eax, eax cmp rsi, 16 jb .LBB12_4 shl rsi, 2 pxor xmm0, xmm0 pxor xmm1, xmm1 pxor xmm3, xmm3 pxor xmm2, xmm2 .p2align 4 .LBB12_2: movdqu xmm4, xmmword ptr [rdi] paddd xmm0, xmm4 movdqu xmm4, xmmword ptr [rdi + 16] paddd xmm1, xmm4 movdqu xmm4, xmmword ptr [rdi + 32] paddd xmm3, xmm4 movdqu xmm4, xmmword ptr [rdi + 48] paddd xmm2, xmm4 add rdi, 64 add rsi, -64 cmp rsi, 60 ja .LBB12_2 paddd xmm0, xmm3 paddd xmm1, xmm2 paddd xmm1, xmm0 pshufd xmm0, xmm1, 238 paddd xmm0, xmm1 pshufd xmm1, xmm0, 85 paddd xmm1, xmm0 movd eax, xmm1 .LBB12_4: ret ```
codegen for example (sourced from #98326 (comment))
goes from (znver4)
many asm
to