Use `load`+`store` instead of `memcpy` for small integer arrays by scottmcm · Pull Request #111999 · rust-lang/rust
source link: https://github.com/rust-lang/rust/pull/111999
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.
Conversation
Member
I was inspired by #98892 to see whether, rather than making `mem::swap` do something smart in the library, we could update MIR assignments like `*_1 = *_2` to do something smarter than `memcpy` for types small enough that doing it inline is going to be better than a `memcpy` call in assembly anyway. After all, special code may help `mem::swap`, but if the "obvious" MIR can just result in the correct thing, that helps everything -- other code like `mem::replace`, people doing it manually, and just passing around by value in general -- and it also makes MIR inlining happier, since it doesn't need to deal with all the complicated library code if it just sees a couple of assignments.
LLVM will turn the short, known-length `memcpy`s into direct instructions in the backend, but that's too late for it to be able to remove `alloca`s. In general, replacing `memcpy`s with typed instructions is hard in the middle-end -- even for `memcpy.inline`, where it knows it won't be a function call -- due to poison propagation issues. So because we know more about the type invariants -- these are typed copies -- rustc can emit something more specific, allowing LLVM to `mem2reg` away the `alloca`s in some situations.
#52051 previously did something like this in the library for `mem::swap`, but it ended up regressing when MIR inlining was enabled (cbbf06b), so this has been suboptimal on stable for ≈5 releases now.
The code in this PR is narrowly targeted at just integer arrays in LLVM, but works via a new method on the `LayoutTypeMethods` trait, so specific backends based on cg_ssa can enable this for more situations over time, as we find them. I don't want to try to bite off too much in this PR, though. (Transparent newtypes and simple things like the 3×usize `String` would be obvious candidates for a follow-up.)
Codegen demonstrations: https://llvm.godbolt.org/z/fK8hT9aqv
Before:
define void @swap_rgb48_old(ptr noalias nocapture noundef align 2 dereferenceable(6) %x, ptr noalias nocapture noundef align 2 dereferenceable(6) %y) unnamed_addr #1 {
%a.i = alloca [3 x i16], align 2
call void @llvm.lifetime.start.p0(i64 6, ptr nonnull %a.i)
call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 2 dereferenceable(6) %a.i, ptr noundef nonnull align 2 dereferenceable(6) %x, i64 6, i1 false)
tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 2 dereferenceable(6) %x, ptr noundef nonnull align 2 dereferenceable(6) %y, i64 6, i1 false)
call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 2 dereferenceable(6) %y, ptr noundef nonnull align 2 dereferenceable(6) %a.i, i64 6, i1 false)
call void @llvm.lifetime.end.p0(i64 6, ptr nonnull %a.i)
ret void
}
Note that it goes through the stack:
swap_rgb48_old: # @swap_rgb48_old
movzx eax, word ptr [rdi + 4]
mov word ptr [rsp - 4], ax
mov eax, dword ptr [rdi]
mov dword ptr [rsp - 8], eax
movzx eax, word ptr [rsi + 4]
mov word ptr [rdi + 4], ax
mov eax, dword ptr [rsi]
mov dword ptr [rdi], eax
movzx eax, word ptr [rsp - 4]
mov word ptr [rsi + 4], ax
mov eax, dword ptr [rsp - 8]
mov dword ptr [rsi], eax
ret
define void @swap_rgb48(ptr noalias nocapture noundef align 2 dereferenceable(6) %x, ptr noalias nocapture noundef align 2 dereferenceable(6) %y) unnamed_addr #0 {
start:
%0 = load <3 x i16>, ptr %x, align 2
%1 = load <3 x i16>, ptr %y, align 2
store <3 x i16> %1, ptr %x, align 2
store <3 x i16> %0, ptr %y, align 2
ret void
}
The new IR for `swap_rgb48` (above) still lowers to `dword`+`word` operations, but has no stack traffic:
swap_rgb48: # @swap_rgb48
mov eax, dword ptr [rdi]
movzx ecx, word ptr [rdi + 4]
movzx edx, word ptr [rsi + 4]
mov r8d, dword ptr [rsi]
mov dword ptr [rdi], r8d
mov word ptr [rdi + 4], dx
mov word ptr [rsi + 4], cx
mov dword ptr [rsi], eax
ret
And as a demonstration that this isn't just `mem::swap`: a `mem::replace` on a small array (since `replace` doesn't use `swap` since #83022), which used to be `memcpy`s in LLVM, now produces the following IR:
define void @replace_short_array(ptr noalias nocapture noundef sret([3 x i32]) dereferenceable(12) %0, ptr noalias noundef align 4 dereferenceable(12) %r, ptr noalias nocapture noundef readonly dereferenceable(12) %v) unnamed_addr #0 {
start:
%1 = load <3 x i32>, ptr %r, align 4
store <3 x i32> %1, ptr %0, align 4
%2 = load <3 x i32>, ptr %v, align 4
store <3 x i32> %2, ptr %r, align 4
ret void
}
but that still lowers to reasonable `dword`+`qword` instructions:
replace_short_array: # @replace_short_array
mov rax, rdi
mov rcx, qword ptr [rsi]
mov edi, dword ptr [rsi + 8]
mov dword ptr [rax + 8], edi
mov qword ptr [rax], rcx
mov rcx, qword ptr [rdx]
mov edx, dword ptr [rdx + 8]
mov dword ptr [rsi + 8], edx
mov qword ptr [rsi], rcx
ret
Recommend
About Joyk
Aggregate valuable and interesting links.
Joyk means Joy of geeK