ary3

Full source code

C++

// -*- mode: c++ -*-
// $Id$
// http://www.bagley.org/~doug/shootout/

#include <cstdlib>
#include <iostream>
#include <vector>

using namespace std;

int main(int argc, char *argv[]) {
#ifdef SMALL_PROBLEM_SIZE
#define LENGTH 150000
#else
#define LENGTH 1500000
#endif
    int i, k, n = ((argc == 2) ? atoi(argv[1]) : LENGTH);
    typedef vector<int> ARY;
    ARY x(n);
    ARY y(n);

    for (i=0; i<n; i++) {
        x[i] = i + 1;
    }
    for (k=0; k<1000; k++) {
        for (int i = n - 1; i >= 0; --i) {
            y[i] += x[i];
        }
    }

    cout << y[0] << " " << y.back() << endl;
}

Rust

// Adapted from https://github.com/llvm/llvm-test-suite and
// http://www.bagley.org/~doug/shootout/
use std::env;

#[cfg(feature = "small_problem_size")]
const LENGTH: i32 = 150000;

#[cfg(not(feature = "small_problem_size"))]
const LENGTH: i32 = 1500000;

fn main() {
    let mut args = env::args();
    let n = if args.len() == 2 {
        args.nth(1).unwrap().parse::<i32>().unwrap()
    } else {
        LENGTH
    };

    let mut x = Vec::<i32>::new();
    x.resize(n as usize, 0);
    let mut y = Vec::<i32>::new();
    y.resize(n as usize, 0);

    for i in 0..n {
        x[i as usize] = i + 1;
    }

    for _k in 0..1000 {
        for i in (0..n).rev() {
            y[i as usize] += x[i as usize];
        }
    }

    println!("{} {}", y[0], y.iter().last().unwrap());
}

Porting notes

A nested loop and an unused loop index variable

C++

    for (k=0; k<1000; k++) {
	for (int i = n - 1; i >= 0; --i) {
	    y[i] += x[i];
	}
    }

Rust

#![allow(unused)]
fn main() {
    for _k in 0..1000 {
        for i in (0..n).rev() {
            y[i as usize] += x[i as usize];
        }
    }
}

A nested loop is written in a way similar to C++.

The Rust compiler emits a warning for an unused loop variable, as in _k, to suggest adding the preceding underscore in its name though it doesn't cause an error.

Performance analysis

The benchmark results show that the Rust version is more than 4 times slower than the C++ version.

We compare the generated code between C++ and Rust.

C++

Here is the hot loop according to the Linux Perf:

C++ perf data

  7.51 │1c0:   movdqu  -0x10(%rsi,%r10,4),%xmm0
  7.65 │       movdqu  (%rsi,%r10,4),%xmm1
 14.29 │       movdqu  -0x10(%rdi,%r10,4),%xmm2
  7.07 │       paddd   %xmm0,%xmm2
  8.11 │       movdqu  (%rdi,%r10,4),%xmm0
  9.28 │       paddd   %xmm1,%xmm0
  8.99 │       movdqu  %xmm0,(%rdi,%r10,4)
 18.54 │       movdqu  %xmm2,-0x10(%rdi,%r10,4)
  9.43 │       add     $0xfffffffffffffff8,%r10
  8.98 │       cmp     %r10,%r8
       │     ↑ jne     1c0

This corresponds to the nested loop in the second loop in the source code.

C++ LLVM IR (the frontend output)

77:                                               ; preds = %76, %77
  %78 = phi i64 [ %93, %77 ], [ 0, %76 ]
  %79 = xor i64 %78, -1
  %80 = add i64 %79, %40
  %81 = getelementptr inbounds nuw i32, ptr %33, i64 %80
  %82 = getelementptr inbounds i8, ptr %81, i64 -12
  %83 = getelementptr inbounds i8, ptr %81, i64 -28
  %84 = load <4 x i32>, ptr %82, align 4, !tbaa !10
  %85 = load <4 x i32>, ptr %83, align 4, !tbaa !10
  %86 = getelementptr inbounds nuw i32, ptr %36, i64 %80
  %87 = getelementptr inbounds i8, ptr %86, i64 -12
  %88 = getelementptr inbounds i8, ptr %86, i64 -28
  %89 = load <4 x i32>, ptr %87, align 4, !tbaa !10
  %90 = load <4 x i32>, ptr %88, align 4, !tbaa !10
  %91 = add nsw <4 x i32> %89, %84
  %92 = add nsw <4 x i32> %90, %85
  store <4 x i32> %91, ptr %87, align 4, !tbaa !10
  store <4 x i32> %92, ptr %88, align 4, !tbaa !10
  %93 = add nuw i64 %78, 8
  %94 = icmp eq i64 %93, %43
  br i1 %94, label %95, label %77, !llvm.loop !17

C++ LLVM IR (right before the Machine IR generation)

87:                                               ; preds = %.preheader, %87
  %lsr.iv = phi i64 [ 0, %.preheader ], [ %lsr.iv.next, %87 ]
  %88 = shl nsw i64 %lsr.iv, 2
  %scevgep8 = getelementptr i8, ptr %scevgep5, i64 %88
  %89 = shl nsw i64 %lsr.iv, 2
  %scevgep6 = getelementptr i8, ptr %scevgep5, i64 %89
  %scevgep7 = getelementptr i8, ptr %scevgep6, i64 -16
  %90 = load <4 x i32>, ptr %scevgep8, align 4, !tbaa !10
  %91 = load <4 x i32>, ptr %scevgep7, align 4, !tbaa !10
  %92 = shl nsw i64 %lsr.iv, 2
  %scevgep4 = getelementptr i8, ptr %scevgep, i64 %92
  %93 = shl nsw i64 %lsr.iv, 2
  %scevgep2 = getelementptr i8, ptr %scevgep, i64 %93
  %scevgep3 = getelementptr i8, ptr %scevgep2, i64 -16
  %94 = load <4 x i32>, ptr %scevgep4, align 4, !tbaa !10
  %95 = load <4 x i32>, ptr %scevgep3, align 4, !tbaa !10
  %96 = add nsw <4 x i32> %94, %90
  %97 = add nsw <4 x i32> %95, %91
  store <4 x i32> %96, ptr %scevgep4, align 4, !tbaa !10
  store <4 x i32> %97, ptr %scevgep3, align 4, !tbaa !10
  %lsr.iv.next = add nsw i64 %lsr.iv, -8
  %98 = icmp eq i64 %48, %lsr.iv.next
  br i1 %98, label %99, label %87, !llvm.loop !17

Rust

Here is the hot loop according to the Linux Perf:

Rust perf data

  8.96 │370:   cmp     %rdi,%r14
       │     ↓ jbe     587
  8.81 │       cmp     %rdi,%rbp
       │     ↓ jbe     581
  8.59 │       mov     (%rbx,%rdi,4),%r8d
  8.81 │       add     %r8d,(%r15,%rdi,4)
 16.53 │       lea     -0x1(%rdi),%r8
  8.92 │       inc     %rdi
  7.97 │       cmp     $0x1,%rdi
 15.84 │       mov     %r8,%rdi
 15.52 │     ↑ jg      370

This also corresponds to the same nested loop in the second loop in the source code.

Rust LLVM IR (the frontend output)

bb16.us:                                          ; preds = %bb44.us, %bb51.us
  %indvars.iv124 = phi i64 [ %73, %bb44.us ], [ %indvars.iv.next125, %bb51.us ]
  %indvars.iv.next125 = add nsw i64 %indvars.iv124, -1
  %_86.us = icmp ugt i64 %46, %indvars.iv.next125
  br i1 %_86.us, label %bb50.us, label %panic10.invoke

bb50.us:                                          ; preds = %bb16.us
  %_92.us = icmp ugt i64 %56, %indvars.iv.next125
  br i1 %_92.us, label %bb51.us, label %panic10.invoke

bb51.us:                                          ; preds = %bb50.us
  %_33.us = getelementptr inbounds i32, ptr %_24.i.i, i64 %indvars.iv.next125
  %_32.us = load i32, ptr %_33.us, align 4, !noundef !4
  %_35.us = getelementptr inbounds i32, ptr %_24.i.i54, i64 %indvars.iv.next125
  %75 = load i32, ptr %_35.us, align 4, !noundef !4
  %76 = add i32 %75, %_32.us
  store i32 %76, ptr %_35.us, align 4
  %_77.us = icmp sgt i64 %indvars.iv124, 1
  br i1 %_77.us, label %bb16.us, label %bb15.bb12.loopexit_crit_edge.us

It looks like the difference is caused by a lack of loop vectorization in the Rust version.

A June 2025 update on the performance difference

I filed an issue at the Rust repo about this performance difference. According to the conversation, if the second loop is rewritten to use iterators as below, the loop vectorization occurs and the performance becomes on par with the C++ version.

A version of the second loop that uses iterators:

#![allow(unused)]
fn main() {
    for _k in 0..1000 {
        y.iter_mut().zip(x.iter()).rev().for_each(|(y, x)| {
            *y += *x
        })
    }
}

I think that the Rust compiler could be improved to recognize the index-based version of the loop and perform loop vectorization.