strcat

Full source code

C++

// -*- mode: c++ -*-
// $Id$
// http://www.bagley.org/~doug/shootout/
// with help from PeterB

#include <cstdlib>
#include <iostream>
#include <string>
using namespace std;

int main(int argc, char *argv[])
{
#ifdef SMALL_PROBLEM_SIZE
#define LENGTH 1000000
#else
#define LENGTH 5000000
#endif
    int i, n = ((argc == 2) ? atoi(argv[1]) : LENGTH);
    string str;
    size_t capacity = 31;
    str.reserve(capacity); // as per C-string
    size_t newLength = 6;
    for (i = 0; i < n; i++)
    {
        if(newLength > capacity)
        {
            capacity *= 2;
            str.reserve(capacity);
        }
        str += "hello\n";
        newLength += 6;
    }
    cout << str.length() << endl;
    return 0;
}

Rust

// Adapted from https://github.com/llvm/llvm-test-suite and
// http://www.bagley.org/~doug/shootout/
// with help from PeterB
use std::env;

#[cfg(feature = "small_problem_size")]
const LENGTH: i32 = 1000000;

#[cfg(not(feature = "small_problem_size"))]
const LENGTH: i32 = 5000000;

fn main() {
    let mut args = env::args();
    let n = if args.len() == 2 {
        args.nth(1).unwrap().parse::<i32>().unwrap()
    } else {
        LENGTH
    };

    let mut str = String::new();
    let mut capacity: usize = 31;
    str.reserve(capacity);
    let mut new_length: usize = 6;
    for _i in 0..n {
        if new_length > capacity {
            capacity *= 2;
            str.reserve(capacity);
        }
        str.push_str("hello\n");
        new_length += 6;
    }
    println!("{}", str.len());
}

Porting notes

The porting was straightforward with no specific notes.

Performance analysis

The benchmark results show that the Rust version is more than 2 times faster than the C++ version.

We compare the generated code between C++ and Rust.

C++

C++ perfdata (the host function list)

Overhead  Command  Shared Object         Symbol
  19.78%  strcat   libstdc++.so.6.0.30   [.] std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_append(char const*, unsigned long)
  12.01%  strcat   libc.so.6             [.] __memmove_avx_unaligned_erms
   8.25%  strcat   strcat                [.] main
   6.77%  strcat   [unknown]             [k] 0xffffffff9fd9f917
   3.87%  strcat   libc.so.6             [.] __memmove_avx_unaligned
   2.43%  strcat   libstdc++.so.6.0.30   [.] memcpy@plt
   1.95%  strcat   strcat                [.] std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_append(char const*, unsigned long)@plt

C++ perfdata (the hottest function std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_append(char const*, unsigned long)@@GLIBCXX_3.4.21)

  7.22 │      endbr64
  9.87 │      push    %rbp
  2.46 │      mov     %rdx,%r8
       │      push    %rbx
  4.92 │      mov     %rdi,%rbx
  2.46 │      sub     $0x8,%rsp
  2.46 │      mov     0x8(%rdi),%rax
       │      mov     (%rdi),%rdi
       │      lea     (%rdx,%rax,1),%rbp
  2.46 │      lea     0x10(%rbx),%rdx
  2.48 │      cmp     %rdx,%rdi
       │    ↓ je      78
       │      mov     0x10(%rbx),%rdx
  7.27 │28:   cmp     %rbp,%rdx
       │    ↓ jb      60
  4.87 │      test    %r8,%r8
       │    ↓ je      46
  7.33 │      add     %rax,%rdi
  7.40 │      cmp     $0x1,%r8
       │    ↓ je      80
  2.46 │      mov     %r8,%rdx
  7.35 │    → call    memcpy@plt
       │      mov     (%rbx),%rdi
  2.46 │46:   mov     %rbp,0x8(%rbx)
  2.46 │      mov     %rbx,%rax
  2.37 │      movb    $0x0,(%rdi,%rbp,1)
  7.11 │      add     $0x8,%rsp
  9.64 │      pop     %rbx
       │      pop     %rbp
  4.94 │    ← ret

C++ perfdata (The second hottest function __memcpy_avx_unaligned_erms)

 16.08 │ 30:   cmp        $0x10,%edx
       │     ↓ jae        62
  8.10 │       cmp        $0x8,%edx
       │     ↓ jae        80
  4.07 │       cmp        $0x4,%edx
       │     ↓ jae        55
       │       cmp        $0x1,%edx
       │     ↓ jl         54
       │       mov        (%rsi),%cl
       │     ↓ je         52
       │       movzwl     -0x2(%rsi,%rdx,1),%esi
       │       mov        %si,-0x2(%rdi,%rdx,1)
       │ 52:   mov        %cl,(%rdi)
       │ 54: ← ret
  3.77 │ 55:   mov        -0x4(%rsi,%rdx,1),%ecx
  4.06 │       mov        (%rsi),%esi
 12.17 │       mov        %ecx,-0x4(%rdi,%rdx,1)
  4.06 │       mov        %esi,(%rdi)
 12.05 │     ← ret

C++ LLVM IR (the loop in the main function, the frontend output)

16:                                               ; preds = %14, %38
  %17 = phi i64 [ %39, %38 ], [ 6, %14 ]
  %18 = phi i64 [ %30, %38 ], [ 31, %14 ]
  %19 = phi i32 [ %40, %38 ], [ 0, %14 ]
  %20 = icmp ugt i64 %17, %18
  br i1 %20, label %21, label %29

21:                                               ; preds = %16
  %22 = shl i64 %18, 1
  invoke void @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEm(ptr noundef nonnull align 8 dereferenceable(32) %3, i64 noundef %22)
          to label %29 unwind label %25

23:                                               ; preds = %10
  %24 = landingpad { ptr, i32 }
          cleanup
  br label %83

25:                                               ; preds = %21, %36
  %26 = landingpad { ptr, i32 }
          cleanup
  br label %83

27:                                               ; preds = %34, %42, %53, %62, %63, %68, %71
  %28 = landingpad { ptr, i32 }
          cleanup
  br label %83

29:                                               ; preds = %21, %16
  %30 = phi i64 [ %22, %21 ], [ %18, %16 ]
  %31 = load i64, ptr %13, align 8, !tbaa !12
  %32 = add i64 %31, -4611686018427387898
  %33 = icmp ult i64 %32, 6
  br i1 %33, label %34, label %36

34:                                               ; preds = %29
  invoke void @_ZSt20__throw_length_errorPKc(ptr noundef nonnull @.str.1) #11
          to label %35 unwind label %27

35:                                               ; preds = %34
  unreachable

36:                                               ; preds = %29
  %37 = invoke noundef nonnull align 8 dereferenceable(32) ptr @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcm(ptr noundef nonnull align 8 dereferenceable(32) %3, ptr noundef nonnull @.str, i64 noundef 6
)
          to label %38 unwind label %25

38:                                               ; preds = %36
  %39 = add nuw nsw i64 %17, 6
  %40 = add nuw nsw i32 %19, 1
  %41 = icmp eq i32 %40, %11
  br i1 %41, label %42, label %16, !llvm.loop !16

The loop body in the main function doesn't have the basic_string::append and its internal memcpy inlined. IThey aren't probably defined in the C++ string and the string.h header files as its body is not part of the compilation unit. Even though it uses the SIMD (AVX) version of the memcpy, it doesn't use the AVX instructions because it copies only 6 bytes at a time.

Rust

Rust perfdata (the hot function list)

Overhead  Command  Shared Object     Symbol
  34.98%  strcat   strcat            [.] strcat::main

Rust perfdata

       │1d0:   mov    0x8(%rsp),%rax
  4.84 │       cmp    %rbx,%r12
       │     ↓ jbe    1e8
       │       add    %rbx,%rbx
       │       mov    %rax,%rcx
       │       sub    %rsi,%rcx
       │       cmp    %rbx,%rcx
       │     ↓ jb     218
  3.24 │1e8:   sub    %rsi,%rax
 18.89 │       cmp    $0x5,%rax
       │     ↓ jbe    236
  3.25 │1f1:   mov    0x10(%rsp),%rax
 14.99 │       movw   $0xa6f,0x4(%rax,%rsi,1)
 19.38 │       movl   $0x6c6c6568,(%rax,%rsi,1)
  2.98 │       add    $0x6,%rsi
 13.03 │       mov    %rsi,0x18(%rsp)
  3.25 │       add    $0x6,%r12
  3.27 │       dec    %r15d
 12.90 │     ↑ jne    1d0

Rust LLVM IR (the loop body in the main function, the frontend output)

bb29:                                             ; preds = %bb8.preheader, %bb32
  %len.i.i.i.pre62 = phi i64 [ %49, %bb32 ], [ %len.i.i.i.pre62.pre, %bb8.preheader ]
  %capacity.sroa.0.051 = phi i64 [ %capacity.sroa.0.1, %bb32 ], [ 31, %bb8.preheader ]
  %new_length.sroa.0.050 = phi i64 [ %50, %bb32 ], [ 6, %bb8.preheader ]
  %iter.sroa.0.049 = phi i32 [ %47, %bb32 ], [ 0, %bb8.preheader ]
  %47 = add nuw nsw i32 %iter.sroa.0.049, 1
  %_18 = icmp ugt i64 %new_length.sroa.0.050, %capacity.sroa.0.051
  %self2.i.i.i.pre64 = load i64, ptr %str, align 8, !range !33
  br i1 %_18, label %bb10, label %bb12

bb10:                                             ; preds = %bb29
  %48 = shl i64 %capacity.sroa.0.051, 1
  %_10.i31 = sub i64 %self2.i.i.i.pre64, %len.i.i.i.pre62
  %_7.i32 = icmp ult i64 %_10.i31, %48
  br i1 %_7.i32, label %bb1.i33, label %bb12, !prof !32

bb1.i33:                                          ; preds = %bb10
; invoke alloc::raw_vec::RawVecInner<A>::reserve::do_reserve_and_handle
  invoke fastcc void @"_ZN5alloc7raw_vec20RawVecInner$LT$A$GT$7reserve21do_reserve_and_handle17h0f49121d9a52801dE"(ptr noalias noundef nonnull align 8 dereferenceable(16) %str, i64 noundef %len.i.i.i.pre62, i64 noundef %48)
          to label %bb1.i33.bb12_crit_edge unwind label %cleanup2.loopexit

bb1.i33.bb12_crit_edge:                           ; preds = %bb1.i33
  %len.i.i.i.pre = load i64, ptr %_40.sroa.5.0.str.sroa_idx, align 8, !alias.scope !64
  %self2.i.i.i.pre = load i64, ptr %str, align 8, !range !33, !alias.scope !69
  br label %bb12

bb12:                                             ; preds = %bb1.i33.bb12_crit_edge, %bb10, %bb29
  %self2.i.i.i = phi i64 [ %self2.i.i.i.pre64, %bb29 ], [ %self2.i.i.i.pre, %bb1.i33.bb12_crit_edge ], [ %self2.i.i.i.pre64, %bb10 ]
  %len.i.i.i = phi i64 [ %len.i.i.i.pre62, %bb29 ], [ %len.i.i.i.pre, %bb1.i33.bb12_crit_edge ], [ %len.i.i.i.pre62, %bb10 ]
  %capacity.sroa.0.1 = phi i64 [ %capacity.sroa.0.051, %bb29 ], [ %48, %bb1.i33.bb12_crit_edge ], [ %48, %bb10 ]
  call void @llvm.experimental.noalias.scope.decl(metadata !72)
  call void @llvm.experimental.noalias.scope.decl(metadata !73)
  %_10.i.i.i = sub i64 %self2.i.i.i, %len.i.i.i
  %_7.i.i.i = icmp ult i64 %_10.i.i.i, 6
  br i1 %_7.i.i.i, label %bb1.i.i.i, label %bb32, !prof !32

bb1.i.i.i:                                        ; preds = %bb12
; invoke alloc::raw_vec::RawVecInner<A>::reserve::do_reserve_and_handle
  invoke fastcc void @"_ZN5alloc7raw_vec20RawVecInner$LT$A$GT$7reserve21do_reserve_and_handle17h0f49121d9a52801dE"(ptr noalias noundef nonnull align 8 dereferenceable(16) %str, i64 noundef %len.i.i.i, i64 noundef 6)
          to label %.noexc36 unwind label %cleanup2.loopexit

.noexc36:                                         ; preds = %bb1.i.i.i
  %len.pre.i.i = load i64, ptr %_40.sroa.5.0.str.sroa_idx, align 8, !alias.scope !64
  br label %bb32

bb32:                                             ; preds = %.noexc36, %bb12
  %len.i.i = phi i64 [ %len.i.i.i, %bb12 ], [ %len.pre.i.i, %.noexc36 ]
  %_9.i.i = icmp sgt i64 %len.i.i, -1
  call void @llvm.assume(i1 %_9.i.i)
  %_10.i.i = load ptr, ptr %_40.sroa.4.0.str.sroa_idx, align 8, !alias.scope !64, !nonnull !4, !noundef !4
  %dst.i.i = getelementptr inbounds i8, ptr %_10.i.i, i64 %len.i.i
  call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(6) %dst.i.i, ptr noundef nonnull align 1 dereferenceable(6) @alloc_67205f92287559ab7be9a96d10923d6c, i64 6, i1 false), !noalias !64
  %49 = add nuw i64 %len.i.i, 6
  store i64 %49, ptr %_40.sroa.5.0.str.sroa_idx, align 8, !alias.scope !64
  %50 = add nuw nsw i64 %new_length.sroa.0.050, 6
  %exitcond.not = icmp eq i32 %47, %n.sroa.0.0
  br i1 %exitcond.not, label %bb30, label %bb29

The hottest of the loop is fully inlined into the main function and is a straightforward memory copy from the string literal hello\n into the string buffer with occasional size expansion through the reverse call.

I believe that the fact that the push_str and memcpy functions are inlined in the Rust version explains the the performance difference.