Skip to content

Commit 4154d94

Browse files
authored
Unrolled build for #149131
Rollup merge of #149131 - bend-n:optimize_slice_iter_next_chunk, r=Mark-Simulacrum optimize `slice::Iter::next_chunk` codegen for example (sourced from #98326 (comment)) ```rust pub fn simd_sum_slow(arr: &[u32]) -> u32 { const STEP_SIZE: usize = 16; let mut result = [0; STEP_SIZE]; let mut iter = arr.iter(); while let Ok(c) = iter.next_chunk::<STEP_SIZE>() { for (&n, r) in c.iter().zip(result.iter_mut()) { *r += n; } } result.iter().sum() } ``` goes from (znver4) <details> <summary>many asm</summary> ```assembly simd_sum_slow: .cfi_startproc push rbp .cfi_def_cfa_offset 16 push r15 .cfi_def_cfa_offset 24 push r14 .cfi_def_cfa_offset 32 push r13 .cfi_def_cfa_offset 40 push r12 .cfi_def_cfa_offset 48 push rbx .cfi_def_cfa_offset 56 sub rsp, 240 .cfi_def_cfa_offset 296 .cfi_offset rbx, -56 .cfi_offset r12, -48 .cfi_offset r13, -40 .cfi_offset r14, -32 .cfi_offset r15, -24 .cfi_offset rbp, -16 lea r12, [rdi + 4*rsi] mov qword ptr [rsp - 32], 0 mov dword ptr [rsp - 88], 0 mov dword ptr [rsp - 100], 0 mov dword ptr [rsp - 72], 0 mov dword ptr [rsp - 96], 0 mov dword ptr [rsp - 92], 0 mov dword ptr [rsp - 52], 0 mov dword ptr [rsp - 84], 0 mov dword ptr [rsp - 80], 0 mov dword ptr [rsp - 76], 0 mov dword ptr [rsp - 44], 0 mov dword ptr [rsp - 68], 0 mov dword ptr [rsp - 64], 0 mov dword ptr [rsp - 60], 0 mov dword ptr [rsp - 56], 0 mov dword ptr [rsp - 48], 0 mov qword ptr [rsp + 224], r12 cmp rdi, r12 mov qword ptr [rsp + 96], rdi je .LBB12_2 .p2align 4 .LBB12_3: lea r13, [rdi + 4] cmp r13, r12 je .LBB12_4 lea r15, [rdi + 8] cmp r15, r12 je .LBB12_6 lea r14, [rdi + 12] cmp r14, r12 je .LBB12_8 lea rbx, [rdi + 16] cmp rbx, r12 je .LBB12_10 lea r11, [rdi + 20] cmp r11, r12 je .LBB12_12 lea r10, [rdi + 24] cmp r10, r12 je .LBB12_14 lea r9, [rdi + 28] cmp r9, r12 je .LBB12_16 lea r8, [rdi + 32] cmp r8, r12 je .LBB12_18 lea rax, [rdi + 36] cmp rax, r12 je .LBB12_20 mov qword ptr [rsp - 120], rax lea rax, [rdi + 40] mov qword ptr [rsp - 112], rax cmp rax, r12 je .LBB12_22 lea 
rdx, [rdi + 44] cmp rdx, r12 je .LBB12_24 lea rbp, [rdi + 48] cmp rbp, r12 je .LBB12_26 mov qword ptr [rsp - 40], r9 lea rcx, [rdi + 52] cmp rcx, r12 je .LBB12_30 lea r9, [rdi + 56] cmp r9, r12 je .LBB12_32 lea rax, [rdi + 60] cmp rax, r12 mov qword ptr [rsp - 24], r9 je .LBB12_34 mov qword ptr [rsp + 88], rax lea rax, [rdi + 64] mov qword ptr [rsp - 8], rax mov dword ptr [rsp - 128], 0 mov qword ptr [rsp - 16], rdi mov qword ptr [rsp + 56], r10 lea r10, [rsp + 216] mov qword ptr [rsp + 40], r10 mov qword ptr [rsp + 96], r13 lea rax, [rsp + 208] mov qword ptr [rsp + 32], rax mov qword ptr [rsp + 72], rdx lea rsi, [rsp + 200] mov qword ptr [rsp + 48], rbx lea rbx, [rsp + 192] mov qword ptr [rsp + 80], rcx lea rcx, [rsp + 184] mov qword ptr [rsp + 64], rbp lea rdx, [rsp + 176] mov qword ptr [rsp + 16], r14 lea r14, [rsp + 168] mov qword ptr [rsp + 24], r11 lea r11, [rsp + 160] lea r9, [rsp + 152] lea r12, [rsp + 144] lea r13, [rsp + 136] lea rbp, [rsp + 128] mov qword ptr [rsp], r15 lea r15, [rsp + 120] mov qword ptr [rsp + 8], r8 lea r8, [rsp + 112] lea rdi, [rsp + 104] jmp .LBB12_39 .p2align 4 .LBB12_2: mov qword ptr [rsp - 128], 0 jmp .LBB12_37 .p2align 4 .LBB12_4: mov eax, 1 mov qword ptr [rsp - 128], rax jmp .LBB12_37 .p2align 4 .LBB12_6: mov eax, 2 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_8: mov eax, 3 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_10: mov eax, 4 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_12: mov eax, 5 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_14: mov eax, 6 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_16: mov eax, 7 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_18: mov eax, 8 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_20: mov eax, 9 mov qword ptr [rsp - 128], rax jmp .LBB12_36 .LBB12_22: mov eax, 10 mov qword ptr [rsp - 128], rax mov rdi, qword ptr [rsp - 120] jmp .LBB12_36 .LBB12_24: mov eax, 11 mov qword ptr [rsp - 128], rax jmp .LBB12_27 .LBB12_26: mov eax, 12 mov qword ptr [rsp - 
128], rax .LBB12_27: mov rsi, qword ptr [rsp - 112] mov rdi, qword ptr [rsp - 120] jmp .LBB12_36 .LBB12_30: mov rcx, rbp mov eax, 13 mov qword ptr [rsp - 128], rax jmp .LBB12_35 .LBB12_32: mov rax, rcx mov rcx, rbp mov esi, 14 mov qword ptr [rsp - 128], rsi jmp .LBB12_35 .LBB12_34: mov rax, rcx mov rcx, rbp mov esi, 15 mov qword ptr [rsp - 128], rsi .LBB12_35: mov rsi, qword ptr [rsp - 112] mov rdi, qword ptr [rsp - 120] mov r9, qword ptr [rsp - 40] .p2align 4 .LBB12_36: mov rbp, r13 .LBB12_37: mov r13, qword ptr [rsp - 24] mov qword ptr [rsp + 88], r13 mov r13, qword ptr [rsp - 128] mov qword ptr [rsp + 216], r13 mov r13b, 1 mov dword ptr [rsp - 128], r13d mov qword ptr [rsp - 24], rax mov qword ptr [rsp + 80], rcx mov qword ptr [rsp + 64], rdx mov qword ptr [rsp + 72], rsi mov qword ptr [rsp - 112], rdi mov qword ptr [rsp - 120], r8 mov qword ptr [rsp + 8], r9 mov qword ptr [rsp - 40], r10 mov qword ptr [rsp + 56], r11 mov qword ptr [rsp + 24], rbx mov qword ptr [rsp + 48], r14 mov qword ptr [rsp + 16], r15 mov qword ptr [rsp], rbp mov eax, 0 mov qword ptr [rsp - 16], rax mov qword ptr [rsp - 8], r12 lea r10, [rsp + 208] mov qword ptr [rsp + 40], r10 lea rax, [rsp + 200] mov qword ptr [rsp + 32], rax lea rsi, [rsp + 192] lea rbx, [rsp + 184] lea rcx, [rsp + 176] lea rdx, [rsp + 168] lea r14, [rsp + 160] lea r11, [rsp + 152] lea r9, [rsp + 144] lea r12, [rsp + 136] lea r13, [rsp + 128] lea rbp, [rsp + 120] lea r15, [rsp + 112] lea r8, [rsp + 104] lea rdi, [rsp + 232] .LBB12_39: mov r10, qword ptr [rsp + 96] mov rax, qword ptr [rsp + 40] mov qword ptr [rax], r10 mov rax, qword ptr [rsp + 32] mov r10, qword ptr [rsp] mov qword ptr [rax], r10 mov rax, qword ptr [rsp + 16] mov qword ptr [rsi], rax mov rax, qword ptr [rsp + 48] mov qword ptr [rbx], rax mov rax, qword ptr [rsp + 24] mov qword ptr [rcx], rax mov rax, qword ptr [rsp + 56] mov qword ptr [rdx], rax mov rax, qword ptr [rsp - 40] mov qword ptr [r14], rax mov rax, qword ptr [rsp + 8] mov qword ptr [r11], rax 
mov rax, qword ptr [rsp - 120] mov qword ptr [r9], rax mov rax, qword ptr [rsp - 112] mov qword ptr [r12], rax mov rax, qword ptr [rsp + 72] mov qword ptr [r13], rax mov rax, qword ptr [rsp + 64] mov qword ptr [rbp], rax mov rax, qword ptr [rsp + 80] mov qword ptr [r15], rax mov rax, qword ptr [rsp - 24] mov qword ptr [r8], rax mov rax, qword ptr [rsp + 88] mov qword ptr [rdi], rax cmp byte ptr [rsp - 128], 0 jne .LBB12_40 mov rax, qword ptr [rsp - 32] mov rcx, qword ptr [rsp - 16] add eax, dword ptr [rcx] mov qword ptr [rsp - 32], rax mov rax, qword ptr [rsp + 216] mov ecx, dword ptr [rsp - 88] add ecx, dword ptr [rax] mov dword ptr [rsp - 88], ecx mov rax, qword ptr [rsp + 208] mov ecx, dword ptr [rsp - 100] add ecx, dword ptr [rax] mov dword ptr [rsp - 100], ecx mov rax, qword ptr [rsp + 200] mov ecx, dword ptr [rsp - 72] add ecx, dword ptr [rax] mov dword ptr [rsp - 72], ecx mov rax, qword ptr [rsp + 192] mov ecx, dword ptr [rsp - 96] add ecx, dword ptr [rax] mov dword ptr [rsp - 96], ecx mov rax, qword ptr [rsp + 184] mov ecx, dword ptr [rsp - 92] add ecx, dword ptr [rax] mov dword ptr [rsp - 92], ecx mov rax, qword ptr [rsp + 176] mov ecx, dword ptr [rsp - 52] add ecx, dword ptr [rax] mov dword ptr [rsp - 52], ecx mov rax, qword ptr [rsp + 168] mov ecx, dword ptr [rsp - 84] add ecx, dword ptr [rax] mov dword ptr [rsp - 84], ecx mov rax, qword ptr [rsp + 160] mov ecx, dword ptr [rsp - 80] add ecx, dword ptr [rax] mov dword ptr [rsp - 80], ecx mov rax, qword ptr [rsp + 152] mov ecx, dword ptr [rsp - 76] add ecx, dword ptr [rax] mov dword ptr [rsp - 76], ecx mov rax, qword ptr [rsp + 144] mov ecx, dword ptr [rsp - 44] add ecx, dword ptr [rax] mov dword ptr [rsp - 44], ecx mov rax, qword ptr [rsp + 136] mov ecx, dword ptr [rsp - 68] add ecx, dword ptr [rax] mov dword ptr [rsp - 68], ecx mov rax, qword ptr [rsp + 128] mov ecx, dword ptr [rsp - 64] add ecx, dword ptr [rax] mov dword ptr [rsp - 64], ecx mov rax, qword ptr [rsp + 120] mov ecx, dword ptr [rsp - 60] 
add ecx, dword ptr [rax] mov dword ptr [rsp - 60], ecx mov rax, qword ptr [rsp + 112] mov ecx, dword ptr [rsp - 56] add ecx, dword ptr [rax] mov dword ptr [rsp - 56], ecx mov rax, qword ptr [rsp + 104] mov ecx, dword ptr [rsp - 48] add ecx, dword ptr [rax] mov dword ptr [rsp - 48], ecx mov rdi, qword ptr [rsp - 8] mov r12, qword ptr [rsp + 224] cmp rdi, r12 mov qword ptr [rsp + 96], rdi jne .LBB12_3 jmp .LBB12_2 .LBB12_40: mov eax, dword ptr [rsp - 88] add eax, dword ptr [rsp - 32] mov ecx, dword ptr [rsp - 72] add ecx, dword ptr [rsp - 100] add ecx, eax mov edx, dword ptr [rsp - 92] add edx, dword ptr [rsp - 96] mov eax, dword ptr [rsp - 52] add eax, edx add eax, ecx mov ecx, dword ptr [rsp - 80] add ecx, dword ptr [rsp - 84] mov edx, dword ptr [rsp - 76] add edx, ecx mov ecx, dword ptr [rsp - 44] add ecx, edx add ecx, eax mov edx, dword ptr [rsp - 64] add edx, dword ptr [rsp - 68] mov eax, dword ptr [rsp - 60] add eax, edx mov edx, dword ptr [rsp - 56] add edx, eax mov eax, dword ptr [rsp - 48] add eax, edx add eax, ecx add rsp, 240 .cfi_def_cfa_offset 56 pop rbx .cfi_def_cfa_offset 48 pop r12 .cfi_def_cfa_offset 40 pop r13 .cfi_def_cfa_offset 32 pop r14 .cfi_def_cfa_offset 24 pop r15 .cfi_def_cfa_offset 16 pop rbp .cfi_def_cfa_offset 8 ret ``` </details> to ```assembly simd_sum_slow: .cfi_startproc xor eax, eax cmp rsi, 16 jb .LBB12_4 shl rsi, 2 pxor xmm0, xmm0 pxor xmm1, xmm1 pxor xmm3, xmm3 pxor xmm2, xmm2 .p2align 4 .LBB12_2: movdqu xmm4, xmmword ptr [rdi] paddd xmm0, xmm4 movdqu xmm4, xmmword ptr [rdi + 16] paddd xmm1, xmm4 movdqu xmm4, xmmword ptr [rdi + 32] paddd xmm3, xmm4 movdqu xmm4, xmmword ptr [rdi + 48] paddd xmm2, xmm4 add rdi, 64 add rsi, -64 cmp rsi, 60 ja .LBB12_2 paddd xmm0, xmm3 paddd xmm1, xmm2 paddd xmm1, xmm0 pshufd xmm0, xmm1, 238 paddd xmm0, xmm1 pshufd xmm1, xmm0, 85 paddd xmm1, xmm0 movd eax, xmm1 .LBB12_4: ret ```
2 parents 1be6b13 + 9a48fb6 commit 4154d94

File tree

2 files changed

+26
-2
lines changed

2 files changed

+26
-2
lines changed

library/core/src/slice/iter.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ impl<'a, T> Iter<'a, T> {
139139
}
140140
}
141141

142-
iterator! {struct Iter -> *const T, &'a T, const, {/* no mut */}, as_ref, {
142+
iterator! {struct Iter -> *const T, &'a T, const, {/* no mut */}, as_ref, each_ref, {
143143
fn is_sorted_by<F>(self, mut compare: F) -> bool
144144
where
145145
Self: Sized,
@@ -368,7 +368,7 @@ impl<T> AsRef<[T]> for IterMut<'_, T> {
368368
// }
369369
// }
370370

371-
iterator! {struct IterMut -> *mut T, &'a mut T, mut, {mut}, as_mut, {}}
371+
iterator! {struct IterMut -> *mut T, &'a mut T, mut, {mut}, as_mut, each_mut, {}}
372372

373373
/// An internal abstraction over the splitting iterators, so that
374374
/// splitn, splitn_mut etc can be implemented once.

library/core/src/slice/iter/macros.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ macro_rules! iterator {
6868
$raw_mut:tt,
6969
{$( $mut_:tt )?},
7070
$into_ref:ident,
71+
$array_ref:ident,
7172
{$($extra:tt)*}
7273
) => {
7374
impl<'a, T> $name<'a, T> {
@@ -190,6 +191,29 @@ macro_rules! iterator {
190191
}
191192
}
192193

194+
// Specialized `next_chunk` for the slice iterators generated by this macro.
// Returns `Ok` with an array of the next `N` items when at least `N` remain;
// otherwise returns `Err` with an `array::IntoIter` holding the `len` items
// that were left. Zero-sized `T` defers to the generic `array::iter_next_chunk`.
fn next_chunk<const N:usize>(&mut self) -> Result<[$elem; N], crate::array::IntoIter<$elem, N>> {
195+
if T::IS_ZST {
196+
return crate::array::iter_next_chunk(self);
197+
}
198+
let len = len!(self);
199+
if len >= N {
200+
// SAFETY: `len >= N` was just checked, so N elements remain; we take them as a single [T; N] and move the start pointer past them
201+
let r = unsafe { self.post_inc_start(N).cast_array().$into_ref() }
202+
.$array_ref(); // must convert &[T; N] to [&T; N]
203+
Ok(r)
204+
} else {
205+
// can't use $array_ref because there's no builtin for &mut [MaybeUninit<T>; N] -> [&mut MaybeUninit<T>; N]
206+
// can't use copy_nonoverlapping as the $elem is of type &{mut} T instead of T
207+
let mut a = [const { crate::mem::MaybeUninit::<$elem>::uninit() }; N];
208+
for into in (&mut a).into_iter().take(len) {
209+
// SAFETY: take(len) caps the loop at the number of remaining elements (iterating a subslice instead produces worse codegen)
210+
into.write(unsafe { self.post_inc_start(1).$into_ref() });
211+
}
212+
// SAFETY: the loop above initialized exactly elements 0..len (len < N in this branch)
213+
unsafe { Err(crate::array::IntoIter::new_unchecked(a, 0..len)) }
214+
}
215+
}
216+
193217
#[inline]
194218
fn size_hint(&self) -> (usize, Option<usize>) {
195219
let exact = len!(self);

0 commit comments

Comments
 (0)