strcat
Full source code
C++
// -*- mode: c++ -*-
// $Id$
// http://www.bagley.org/~doug/shootout/
// with help from PeterB
#include <cstdlib>
#include <iostream>
#include <string>
using namespace std;
int main(int argc, char *argv[])
{
#ifdef SMALL_PROBLEM_SIZE
#define LENGTH 1000000
#else
#define LENGTH 5000000
#endif
int i, n = ((argc == 2) ? atoi(argv[1]) : LENGTH);
string str;
size_t capacity = 31;
str.reserve(capacity); // as per C-string
size_t newLength = 6;
for (i = 0; i < n; i++)
{
if(newLength > capacity)
{
capacity *= 2;
str.reserve(capacity);
}
str += "hello\n";
newLength += 6;
}
cout << str.length() << endl;
return 0;
}
Rust
// Adapted from https://github.com/llvm/llvm-test-suite and // http://www.bagley.org/~doug/shootout/ // with help from PeterB use std::env; #[cfg(feature = "small_problem_size")] const LENGTH: i32 = 1000000; #[cfg(not(feature = "small_problem_size"))] const LENGTH: i32 = 5000000; fn main() { let mut args = env::args(); let n = if args.len() == 2 { args.nth(1).unwrap().parse::<i32>().unwrap() } else { LENGTH }; let mut str = String::new(); let mut capacity: usize = 31; str.reserve(capacity); let mut new_length: usize = 6; for _i in 0..n { if new_length > capacity { capacity *= 2; str.reserve(capacity); } str.push_str("hello\n"); new_length += 6; } println!("{}", str.len()); }
Porting notes
The porting was straightforward with no specific notes.
Performance analysis
The benchmark results show that the Rust version is more than 2 times faster than the C++ version.
We compare the generated code between C++ and Rust.
C++
C++ perfdata (the host function list)
Overhead Command Shared Object Symbol
19.78% strcat libstdc++.so.6.0.30 [.] std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_append(char const*, unsigned long)
12.01% strcat libc.so.6 [.] __memmove_avx_unaligned_erms
8.25% strcat strcat [.] main
6.77% strcat [unknown] [k] 0xffffffff9fd9f917
3.87% strcat libc.so.6 [.] __memmove_avx_unaligned
2.43% strcat libstdc++.so.6.0.30 [.] memcpy@plt
1.95% strcat strcat [.] std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_append(char const*, unsigned long)@plt
C++ perfdata (the hottest function std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_append(char const*, unsigned long)@@GLIBCXX_3.4.21
)
7.22 │ endbr64
9.87 │ push %rbp
2.46 │ mov %rdx,%r8
│ push %rbx
4.92 │ mov %rdi,%rbx
2.46 │ sub $0x8,%rsp
2.46 │ mov 0x8(%rdi),%rax
│ mov (%rdi),%rdi
│ lea (%rdx,%rax,1),%rbp
2.46 │ lea 0x10(%rbx),%rdx
2.48 │ cmp %rdx,%rdi
│ ↓ je 78
│ mov 0x10(%rbx),%rdx
7.27 │28: cmp %rbp,%rdx
│ ↓ jb 60
4.87 │ test %r8,%r8
│ ↓ je 46
7.33 │ add %rax,%rdi
7.40 │ cmp $0x1,%r8
│ ↓ je 80
2.46 │ mov %r8,%rdx
7.35 │ → call memcpy@plt
│ mov (%rbx),%rdi
2.46 │46: mov %rbp,0x8(%rbx)
2.46 │ mov %rbx,%rax
2.37 │ movb $0x0,(%rdi,%rbp,1)
7.11 │ add $0x8,%rsp
9.64 │ pop %rbx
│ pop %rbp
4.94 │ ← ret
C++ perfdata (The second hottest function __memcpy_avx_unaligned_erms
)
16.08 │ 30: cmp $0x10,%edx
│ ↓ jae 62
8.10 │ cmp $0x8,%edx
│ ↓ jae 80
4.07 │ cmp $0x4,%edx
│ ↓ jae 55
│ cmp $0x1,%edx
│ ↓ jl 54
│ mov (%rsi),%cl
│ ↓ je 52
│ movzwl -0x2(%rsi,%rdx,1),%esi
│ mov %si,-0x2(%rdi,%rdx,1)
│ 52: mov %cl,(%rdi)
│ 54: ← ret
3.77 │ 55: mov -0x4(%rsi,%rdx,1),%ecx
4.06 │ mov (%rsi),%esi
12.17 │ mov %ecx,-0x4(%rdi,%rdx,1)
4.06 │ mov %esi,(%rdi)
12.05 │ ← ret
C++ LLVM IR (the loop in the main function, the frontend output)
16: ; preds = %14, %38
%17 = phi i64 [ %39, %38 ], [ 6, %14 ]
%18 = phi i64 [ %30, %38 ], [ 31, %14 ]
%19 = phi i32 [ %40, %38 ], [ 0, %14 ]
%20 = icmp ugt i64 %17, %18
br i1 %20, label %21, label %29
21: ; preds = %16
%22 = shl i64 %18, 1
invoke void @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEm(ptr noundef nonnull align 8 dereferenceable(32) %3, i64 noundef %22)
to label %29 unwind label %25
23: ; preds = %10
%24 = landingpad { ptr, i32 }
cleanup
br label %83
25: ; preds = %21, %36
%26 = landingpad { ptr, i32 }
cleanup
br label %83
27: ; preds = %34, %42, %53, %62, %63, %68, %71
%28 = landingpad { ptr, i32 }
cleanup
br label %83
29: ; preds = %21, %16
%30 = phi i64 [ %22, %21 ], [ %18, %16 ]
%31 = load i64, ptr %13, align 8, !tbaa !12
%32 = add i64 %31, -4611686018427387898
%33 = icmp ult i64 %32, 6
br i1 %33, label %34, label %36
34: ; preds = %29
invoke void @_ZSt20__throw_length_errorPKc(ptr noundef nonnull @.str.1) #11
to label %35 unwind label %27
35: ; preds = %34
unreachable
36: ; preds = %29
%37 = invoke noundef nonnull align 8 dereferenceable(32) ptr @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcm(ptr noundef nonnull align 8 dereferenceable(32) %3, ptr noundef nonnull @.str, i64 noundef 6
)
to label %38 unwind label %25
38: ; preds = %36
%39 = add nuw nsw i64 %17, 6
%40 = add nuw nsw i32 %19, 1
%41 = icmp eq i32 %40, %11
br i1 %41, label %42, label %16, !llvm.loop !16
The loop body in the main function doesn't have the basic_string::append
and its internal memcpy
inlined. IThey aren't probably defined in the C++ string
and the string.h
header files as its body is not part of the compilation unit. Even though it uses the SIMD (AVX) version of the memcpy
, it doesn't use the AVX instructions because it copies only 6 bytes at a time.
Rust
Rust perfdata (the hot function list)
Overhead Command Shared Object Symbol
34.98% strcat strcat [.] strcat::main
Rust perfdata
│1d0: mov 0x8(%rsp),%rax
4.84 │ cmp %rbx,%r12
│ ↓ jbe 1e8
│ add %rbx,%rbx
│ mov %rax,%rcx
│ sub %rsi,%rcx
│ cmp %rbx,%rcx
│ ↓ jb 218
3.24 │1e8: sub %rsi,%rax
18.89 │ cmp $0x5,%rax
│ ↓ jbe 236
3.25 │1f1: mov 0x10(%rsp),%rax
14.99 │ movw $0xa6f,0x4(%rax,%rsi,1)
19.38 │ movl $0x6c6c6568,(%rax,%rsi,1)
2.98 │ add $0x6,%rsi
13.03 │ mov %rsi,0x18(%rsp)
3.25 │ add $0x6,%r12
3.27 │ dec %r15d
12.90 │ ↑ jne 1d0
Rust LLVM IR (the loop body in the main function, the frontend output)
bb29: ; preds = %bb8.preheader, %bb32
%len.i.i.i.pre62 = phi i64 [ %49, %bb32 ], [ %len.i.i.i.pre62.pre, %bb8.preheader ]
%capacity.sroa.0.051 = phi i64 [ %capacity.sroa.0.1, %bb32 ], [ 31, %bb8.preheader ]
%new_length.sroa.0.050 = phi i64 [ %50, %bb32 ], [ 6, %bb8.preheader ]
%iter.sroa.0.049 = phi i32 [ %47, %bb32 ], [ 0, %bb8.preheader ]
%47 = add nuw nsw i32 %iter.sroa.0.049, 1
%_18 = icmp ugt i64 %new_length.sroa.0.050, %capacity.sroa.0.051
%self2.i.i.i.pre64 = load i64, ptr %str, align 8, !range !33
br i1 %_18, label %bb10, label %bb12
bb10: ; preds = %bb29
%48 = shl i64 %capacity.sroa.0.051, 1
%_10.i31 = sub i64 %self2.i.i.i.pre64, %len.i.i.i.pre62
%_7.i32 = icmp ult i64 %_10.i31, %48
br i1 %_7.i32, label %bb1.i33, label %bb12, !prof !32
bb1.i33: ; preds = %bb10
; invoke alloc::raw_vec::RawVecInner<A>::reserve::do_reserve_and_handle
invoke fastcc void @"_ZN5alloc7raw_vec20RawVecInner$LT$A$GT$7reserve21do_reserve_and_handle17h0f49121d9a52801dE"(ptr noalias noundef nonnull align 8 dereferenceable(16) %str, i64 noundef %len.i.i.i.pre62, i64 noundef %48)
to label %bb1.i33.bb12_crit_edge unwind label %cleanup2.loopexit
bb1.i33.bb12_crit_edge: ; preds = %bb1.i33
%len.i.i.i.pre = load i64, ptr %_40.sroa.5.0.str.sroa_idx, align 8, !alias.scope !64
%self2.i.i.i.pre = load i64, ptr %str, align 8, !range !33, !alias.scope !69
br label %bb12
bb12: ; preds = %bb1.i33.bb12_crit_edge, %bb10, %bb29
%self2.i.i.i = phi i64 [ %self2.i.i.i.pre64, %bb29 ], [ %self2.i.i.i.pre, %bb1.i33.bb12_crit_edge ], [ %self2.i.i.i.pre64, %bb10 ]
%len.i.i.i = phi i64 [ %len.i.i.i.pre62, %bb29 ], [ %len.i.i.i.pre, %bb1.i33.bb12_crit_edge ], [ %len.i.i.i.pre62, %bb10 ]
%capacity.sroa.0.1 = phi i64 [ %capacity.sroa.0.051, %bb29 ], [ %48, %bb1.i33.bb12_crit_edge ], [ %48, %bb10 ]
call void @llvm.experimental.noalias.scope.decl(metadata !72)
call void @llvm.experimental.noalias.scope.decl(metadata !73)
%_10.i.i.i = sub i64 %self2.i.i.i, %len.i.i.i
%_7.i.i.i = icmp ult i64 %_10.i.i.i, 6
br i1 %_7.i.i.i, label %bb1.i.i.i, label %bb32, !prof !32
bb1.i.i.i: ; preds = %bb12
; invoke alloc::raw_vec::RawVecInner<A>::reserve::do_reserve_and_handle
invoke fastcc void @"_ZN5alloc7raw_vec20RawVecInner$LT$A$GT$7reserve21do_reserve_and_handle17h0f49121d9a52801dE"(ptr noalias noundef nonnull align 8 dereferenceable(16) %str, i64 noundef %len.i.i.i, i64 noundef 6)
to label %.noexc36 unwind label %cleanup2.loopexit
.noexc36: ; preds = %bb1.i.i.i
%len.pre.i.i = load i64, ptr %_40.sroa.5.0.str.sroa_idx, align 8, !alias.scope !64
br label %bb32
bb32: ; preds = %.noexc36, %bb12
%len.i.i = phi i64 [ %len.i.i.i, %bb12 ], [ %len.pre.i.i, %.noexc36 ]
%_9.i.i = icmp sgt i64 %len.i.i, -1
call void @llvm.assume(i1 %_9.i.i)
%_10.i.i = load ptr, ptr %_40.sroa.4.0.str.sroa_idx, align 8, !alias.scope !64, !nonnull !4, !noundef !4
%dst.i.i = getelementptr inbounds i8, ptr %_10.i.i, i64 %len.i.i
call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(6) %dst.i.i, ptr noundef nonnull align 1 dereferenceable(6) @alloc_67205f92287559ab7be9a96d10923d6c, i64 6, i1 false), !noalias !64
%49 = add nuw i64 %len.i.i, 6
store i64 %49, ptr %_40.sroa.5.0.str.sroa_idx, align 8, !alias.scope !64
%50 = add nuw nsw i64 %new_length.sroa.0.050, 6
%exitcond.not = icmp eq i32 %47, %n.sroa.0.0
br i1 %exitcond.not, label %bb30, label %bb29
The hottest of the loop is fully inlined into the main function and is a straightforward memory copy from the string literal hello\n
into the string buffer with occasional size expansion through the reverse
call.
I believe that the fact that the push_str
and memcpy
functions are inlined in the Rust version explains the the performance difference.