matrix

Full source code

C++

// -*- mode: c++ -*-
// $Id$
// http://www.bagley.org/~doug/shootout/

#include <iostream>
#include <stdlib.h>

using namespace std;

#define SIZE 30

int **mkmatrix(int rows, int cols) {
    int i, j, count = 1;
    int **m = (int **) malloc(rows * sizeof(int *));
    for (i=0; i<rows; i++) {
        m[i] = (int *) malloc(cols * sizeof(int));
        for (j=0; j<cols; j++) {
            m[i][j] = count++;
        }
    }
    return(m);
}

void zeromatrix(int rows, int cols, int **m) {
    int i, j;
    for (i=0; i<rows; i++)
        for (j=0; j<cols; j++)
            m[i][j] = 0;
}

void freematrix(int rows, int **m) {
    while (--rows > -1) { free(m[rows]); }
    free(m);
}

int **mmult(int rows, int cols, int **m1, int **m2, int **m3) {
    int i, j, k, val;
    for (i=0; i<rows; i++) {
        for (j=0; j<cols; j++) {
            val = 0;
            for (k=0; k<cols; k++) {
                val += m1[i][k] * m2[k][j];
            }
            m3[i][j] = val;
        }
    }
    return(m3);
}

int main(int argc, char *argv[]) {
#ifdef SMALL_PROBLEM_SIZE
#define LENGTH 10000
#else
#define LENGTH 100000
#endif
    int i, n = ((argc == 2) ? atoi(argv[1]) : LENGTH);

    int **m1 = mkmatrix(SIZE, SIZE);
    int **m2 = mkmatrix(SIZE, SIZE);
    int **mm = mkmatrix(SIZE, SIZE);

    for (i=0; i<n; i++) {
        mm = mmult(SIZE, SIZE, m1, m2, mm);
    }
    cout << mm[0][0] << " " << mm[2][3] << " " << mm[3][2] << " " << mm[4][4] << endl;

    freematrix(SIZE, m1);
    freematrix(SIZE, m2);
    freematrix(SIZE, mm);
    return(0);
}

Rust

// Adapted from https://github.com/llvm/llvm-test-suite and
// http://www.bagley.org/~doug/shootout/
use std::env;

const SIZE: usize = 30;

#[cfg(feature = "small_problem_size")]
const LENGTH: i32 = 10000;

#[cfg(not(feature = "small_problem_size"))]
const LENGTH: i32 = 100000;

fn mkmatrix(rows: usize, cols: usize) -> Vec::<Vec::<i32>> {
    let mut count = 1;
    let mut m = Vec::<Vec::<i32>>::new();
    m.resize(rows, Default::default());
    for i in 0..rows {
        m[i].resize(cols, Default::default());
        for j in 0..cols {
            m[i][j] = count;
            count += 1;
        }
    }
    return m;
}

fn mmult(rows: usize, cols: usize, m1: &Vec::<Vec::<i32>>,
         m2: &Vec::<Vec::<i32>>, m3: &mut Vec::<Vec::<i32>>) {
    for i in 0..rows {
        for j in 0..cols {
            let mut val = 0;
            for k in 0..cols {
                val += m1[i][k] * m2[k][j];
            }
            m3[i][j] = val;
        }
    }
}

fn main() {
    let mut args = env::args();
    let n = if args.len() == 2 {
        args.nth(1).unwrap().parse::<i32>().unwrap()
    } else {
        LENGTH
    };

    let m1 = mkmatrix(SIZE, SIZE);
    let m2 = mkmatrix(SIZE, SIZE);
    let mut mm = mkmatrix(SIZE, SIZE);

    for _i in 0..n {
        mmult(SIZE, SIZE, &m1, &m2, &mut mm);
    }
    println!("{} {} {} {}", mm[0][0], mm[2][3], mm[3][2], mm[4][4]);
}

Porting notes

Using nested Vecs to represent a matrix

C++

int **mkmatrix(int rows, int cols) {
    int i, j, count = 1;
    int **m = (int **) malloc(rows * sizeof(int *));
    for (i=0; i<rows; i++) {
        m[i] = (int *) malloc(cols * sizeof(int));
        for (j=0; j<cols; j++) {
            m[i][j] = count++;
        }
    }
    return(m);
}

Rust

fn mkmatrix(rows: usize, cols: usize) -> Vec::<Vec::<i32>> {
    let mut count = 1;
    let mut m = Vec::<Vec::<i32>>::new();
    m.resize(rows, Default::default());
    for i in 0..rows {
        m[i].resize(cols, Default::default());
        for j in 0..cols {
            m[i][j] = count;
            count += 1;
        }
    }
    return m;
}

We use a two-level, nested Vec where the first level holds the rows and the second level holds the columns. This mirrors the pointer-to-pointer structure built with malloc in the C++ version. While it would be possible to use raw i32 arrays in Rust, it is more idiomatic to use Vec, treating it as the safe equivalent of a raw array.
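
For comparison, the same nested structure can also be built with iterators instead of resize and per-element assignment. This is only a sketch (the name mkmatrix_iter and the small main are not part of the benchmark); it produces the same Vec<Vec<i32>> contents as mkmatrix above:

fn mkmatrix_iter(rows: usize, cols: usize) -> Vec<Vec<i32>> {
    // Row r holds the values r*cols+1 ..= (r+1)*cols, matching the
    // row-major counter in the benchmark's mkmatrix.
    (0..rows)
        .map(|r| ((r * cols + 1)..).take(cols).map(|v| v as i32).collect())
        .collect()
}

fn main() {
    let m = mkmatrix_iter(3, 4);
    assert_eq!(m[1][0], 5); // the second row continues where the first ended
}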

Passing matrices to the matrix multiplication function using references

C++

    int **m1 = mkmatrix(SIZE, SIZE);
    int **m2 = mkmatrix(SIZE, SIZE);
    int **mm = mkmatrix(SIZE, SIZE);

    for (i=0; i<n; i++) {
        mm = mmult(SIZE, SIZE, m1, m2, mm);
    }

Rust

    let m1 = mkmatrix(SIZE, SIZE);
    let m2 = mkmatrix(SIZE, SIZE);
    let mut mm = mkmatrix(SIZE, SIZE);

    for _i in 0..n {
        mmult(SIZE, SIZE, &m1, &m2, &mut mm);
    }

Freeing matrices

C++

    freematrix(SIZE, m1);
    freematrix(SIZE, m2);
    freematrix(SIZE, mm);

Rust

// empty

The Rust version doesn't need to free the matrices explicitly: thanks to the ownership mechanism, they are dropped automatically at the end of the main function.
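
As an aside, a value can also be dropped earlier than the end of its scope if needed. A minimal sketch, not part of the benchmark, just to illustrate the ownership behavior described above:

fn main() {
    let m1 = vec![vec![0i32; 30]; 30];
    drop(m1); // m1's heap memory (the outer Vec and all row Vecs) is freed here

    let _m2 = vec![vec![0i32; 30]; 30];
    // _m2 is freed automatically when it goes out of scope at the end of main.
}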

Performance analysis

The benchmark results show that the Rust version is more than 2 times slower than the C++ version.

Below, we compare the code generated for the C++ and Rust versions.

C++

Here is the hot loop according to Linux perf:

C++ perfdata

  5.98 │2b0:   mov    -0x10(%r14,%r8,8),%r10
  5.93 │       mov    -0x8(%r14,%r8,8),%r11
  7.04 │       mov    (%r10,%rdi,4),%r10d
 11.66 │       imul   -0x8(%rsi,%r8,4),%r10d
  5.84 │       add    %r9d,%r10d
  5.71 │       mov    (%r11,%rdi,4),%r11d
 10.56 │       imul   -0x4(%rsi,%r8,4),%r11d
 11.00 │       mov    (%r14,%r8,8),%r9
  5.17 │       mov    (%r9,%rdi,4),%r9d
  4.77 │       imul   (%rsi,%r8,4),%r9d
  4.52 │       add    %r11d,%r9d
  4.53 │       add    %r10d,%r9d
  8.94 │       add    $0x3,%r8
  4.73 │       cmp    $0x20,%r8
       │     ↑ jne    2b0

C++ LLVM IR

106:                                              ; preds = %106, %104
  %107 = phi i64 [ 0, %104 ], [ %135, %106 ]
  %108 = phi i32 [ 0, %104 ], [ %134, %106 ]
  %109 = getelementptr inbounds nuw i32, ptr %103, i64 %107
  %110 = load i32, ptr %109, align 4, !tbaa !10
  %111 = getelementptr inbounds nuw ptr, ptr %39, i64 %107
  %112 = load ptr, ptr %111, align 8, !tbaa !5
  %113 = getelementptr inbounds nuw i32, ptr %112, i64 %105
  %114 = load i32, ptr %113, align 4, !tbaa !10
  %115 = mul nsw i32 %114, %110
  %116 = add nsw i32 %115, %108
  %117 = add nuw nsw i64 %107, 1
  %118 = getelementptr inbounds nuw i32, ptr %103, i64 %117
  %119 = load i32, ptr %118, align 4, !tbaa !10
  %120 = getelementptr inbounds nuw ptr, ptr %39, i64 %117
  %121 = load ptr, ptr %120, align 8, !tbaa !5
  %122 = getelementptr inbounds nuw i32, ptr %121, i64 %105
  %123 = load i32, ptr %122, align 4, !tbaa !10
  %124 = mul nsw i32 %123, %119
  %125 = add nsw i32 %124, %116
  %126 = add nuw nsw i64 %107, 2
  %127 = getelementptr inbounds nuw i32, ptr %103, i64 %126
  %128 = load i32, ptr %127, align 4, !tbaa !10
  %129 = getelementptr inbounds nuw ptr, ptr %39, i64 %126
  %130 = load ptr, ptr %129, align 8, !tbaa !5
  %131 = getelementptr inbounds nuw i32, ptr %130, i64 %105
  %132 = load i32, ptr %131, align 4, !tbaa !10
  %133 = mul nsw i32 %132, %128
  %134 = add nsw i32 %133, %125
  %135 = add nuw nsw i64 %107, 3
  %136 = icmp eq i64 %135, 30
  br i1 %136, label %137, label %106, !llvm.loop !22

The loop is unrolled by a factor of 3.

Rust

Here is the hot loop according to Linux perf:

Rust perfdata

 16.54 │ 260:   cmp    %rbx,%rsi
       │      ↓ je     5c3
  5.16 │        cmp    %rbx,%r8
       │      ↓ je     5d2
  4.99 │        mov    (%r12),%r13
  4.84 │        cmp    %r13,%rdi
       │      ↓ jae    5e1
 10.44 │        mov    0x8(%rax),%r13
  5.67 │        mov    0x0(%r13,%rbx,4),%r13d
  5.73 │        mov    -0x8(%r12),%r14
  5.16 │        imul   (%r14,%rdi,4),%r13d
  5.51 │        inc    %rbx
 10.59 │        add    %r13d,%r9d
  5.17 │        add    $0x18,%r12
 17.06 │        cmp    $0x1e,%rbx
       │      ↑ jne    260

Rust LLVM IR

bb10.us.i:                                        ; preds = %bb17.us.i, %bb7.us.i
  %iter2.sroa.0.024.us.i = phi i64 [ 0, %bb7.us.i ], [ %_0.i13.us.i, %bb17.us.i ]
  %val.sroa.0.023.us.i = phi i32 [ 0, %bb7.us.i ], [ %95, %bb17.us.i ]
  %_0.i13.us.i = add nuw nsw i64 %iter2.sroa.0.024.us.i, 1
  %exitcond.not.i = icmp eq i64 %iter2.sroa.0.024.us.i, %_47.us.i
  br i1 %exitcond.not.i, label %panic7.i.invoke, label %bb15.us.i

bb15.us.i:                                        ; preds = %bb10.us.i
  %_48.us.i = load ptr, ptr %90, align 8, !nonnull !4, !noundef !4
  %_15.us.i = getelementptr inbounds i32, ptr %_48.us.i, i64 %iter2.sroa.0.024.us.i
  %_14.us.i = load i32, ptr %_15.us.i, align 4, !noundef !4
  %exitcond89.not.i = icmp eq i64 %iter2.sroa.0.024.us.i, %m2.val25
  br i1 %exitcond89.not.i, label %panic7.i.invoke, label %bb16.us.i

bb16.us.i:                                        ; preds = %bb15.us.i
  call void @llvm.assume(i1 %35)
  %_19.us.i = getelementptr inbounds %"alloc::vec::Vec<i32>", ptr %m2.val, i64 %iter2.sroa.0.024.us.i
  %93 = getelementptr inbounds i8, ptr %_19.us.i, i64 16
  %_57.us.i = load i64, ptr %93, align 8, !noundef !4
  %_60.us.i = icmp ult i64 %iter1.sroa.0.025.us.i, %_57.us.i
  br i1 %_60.us.i, label %bb17.us.i, label %panic7.i.invoke

bb17.us.i:                                        ; preds = %bb16.us.i
  %94 = getelementptr inbounds i8, ptr %_19.us.i, i64 8
  %_58.us.i = load ptr, ptr %94, align 8, !nonnull !4, !noundef !4
  %_18.us.i = getelementptr inbounds i32, ptr %_58.us.i, i64 %iter1.sroa.0.025.us.i
  %_17.us.i = load i32, ptr %_18.us.i, align 4, !noundef !4
  %_13.us.i = mul i32 %_17.us.i, %_14.us.i
  %95 = add i32 %_13.us.i, %val.sroa.0.023.us.i
  %exitcond90.not.i = icmp eq i64 %_0.i13.us.i, 30
  br i1 %exitcond90.not.i, label %bb12.us.i, label %bb10.us.i

The key differences from the C++ version are the following.

First, the mmult function isn't inlined into the main function, which makes it non-trivial to propagate the Vec sizes as the compile-time constant 30. Interprocedural constant propagation, performed independently of inlining, could achieve the same effect, but it doesn't seem to happen here.
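
As an experiment on the source side, one could mark mmult with an inline hint so that the constant SIZE arguments become visible at the call site in main. This is a hypothetical tweak; whether it actually restores the constant propagation for this benchmark has not been verified here:

// Hypothetical tweak: #[inline(always)] maps to LLVM's alwaysinline
// attribute and forces the body into callers, making the literal 30
// visible there. Not verified to change the generated code.
#[inline(always)]
fn mmult(rows: usize, cols: usize, m1: &Vec<Vec<i32>>,
         m2: &Vec<Vec<i32>>, m3: &mut Vec<Vec<i32>>) {
    for i in 0..rows {
        for j in 0..cols {
            let mut val = 0;
            for k in 0..cols {
                val += m1[i][k] * m2[k][j];
            }
            m3[i][j] = val;
        }
    }
}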

Second, the bounds checks are not elided. They could be hoisted out of the loop (loop-invariant code motion), and a specialized, bounds-check-free version of the innermost loop could be generated (loop versioning). The LLVM backend probably implements some form of these optimizations, but they don't seem to trigger here.
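
One common source-level workaround is to drive the loops with iterators so the compiler can see that the indices stay in range. A sketch, not benchmarked here; it iterates over the actual row lengths instead of taking rows/cols parameters, and only the row2[j] access remains indexed:

fn mmult_iter(m1: &[Vec<i32>], m2: &[Vec<i32>], m3: &mut [Vec<i32>]) {
    for (row1, row3) in m1.iter().zip(m3.iter_mut()) {
        for (j, out) in row3.iter_mut().enumerate() {
            // zip pairs row1's elements with the rows of m2, replacing the
            // indexed k-loop; only row2[j] is still a checked indexed access.
            *out = row1
                .iter()
                .zip(m2.iter())
                .map(|(&a, row2)| a * row2[j])
                .sum();
        }
    }
}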

Third, loop unrolling has not been applied. I suspect that, because of the two missing optimizations above, the loop doesn't look simple enough for unrolling to kick in.

Note that these are differences on the frontend side: the three optimizations happen for the Clang-compiled C++ version but not for the rustc-compiled Rust version. The LLVM backend could in theory apply them in either case, but it doesn't seem to, at least in the given environment.

A June 2025 update on the performance difference

I filed an issue in the Rust repository about this performance difference. According to the conversation there, if the program represents the matrices with arrays of compile-time-constant size instead of vectors, heap-allocating them with Box so that the layout matches the C++ version (as below), a similar set of optimizations is performed and the performance gets on par with the C++ version.

A version that uses heap-allocated arrays instead of vectors:

// Adapted from https://github.com/llvm/llvm-test-suite and
// http://www.bagley.org/~doug/shootout/

use std::env;
use std::convert::TryInto;

const SIZE: usize = 30;

#[cfg(feature = "small_problem_size")]
const LENGTH: i32 = 10000;

#[cfg(not(feature = "small_problem_size"))]
const LENGTH: i32 = 100000;

fn mkmatrix(rows: usize, cols: usize) -> Box::<[Box::<[i32; SIZE]>; SIZE]> {
    let mut count = 1;
    let mut row_vec = Vec::<Box<[i32; SIZE]>>::new();
    for _i in 0..rows {
        let mut row: [i32; SIZE] = [0; SIZE];
        for j in 0..cols {
            row[j] = count;
            count += 1;
        }
        row_vec.push(Box::new(row));
    }
    let m: [Box<[i32; SIZE]>; SIZE] = row_vec.try_into().unwrap_or_else(
        |v: Vec<Box<[i32; SIZE]>>| panic!("Expected a Vec of length {} but it was {}", SIZE, v.len()));
    return Box::new(m);
}

fn mmult(rows: usize, cols: usize, m1: &Box::<[Box::<[i32; SIZE]>; SIZE]>,
         m2: &Box::<[Box::<[i32; SIZE]>; SIZE]>, m3: &mut Box::<[Box::<[i32; SIZE]>; SIZE]>) {
    for i in 0..rows {
        for j in 0..cols {
            let mut val = 0;
            for k in 0..cols {
                val += m1[i][k] * m2[k][j];
            }
            m3[i][j] = val;
        }
    }
}

fn main() {
    let mut args = env::args();
    let n = if args.len() == 2 {
        args.nth(1).unwrap().parse::<i32>().unwrap()
    } else {
        LENGTH
    };

    let m1 = mkmatrix(SIZE, SIZE);
    let m2 = mkmatrix(SIZE, SIZE);
    let mut mm = mkmatrix(SIZE, SIZE);

    for _i in 0..n {
        mmult(SIZE, SIZE, &m1, &m2, &mut mm);
    }
    println!("{} {} {} {}", mm[0][0], mm[2][3], mm[3][2], mm[4][4]);
}

However, this version may not be considered equivalent to the C++ one: the array-based mmult in Rust is tied to one specific, compile-time size, whereas the Rust vector version and the C++ version work for any size.
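
One possible middle ground, sketched below and not benchmarked here, is a const-generic variant: the dimension is still a compile-time constant for the optimizer, but the function is written once for any N rather than being hard-wired to 30 (mmult_generic is a hypothetical name):

fn mmult_generic<const N: usize>(
    m1: &[Box<[i32; N]>; N],
    m2: &[Box<[i32; N]>; N],
    m3: &mut [Box<[i32; N]>; N],
) {
    for i in 0..N {
        for j in 0..N {
            let mut val = 0;
            for k in 0..N {
                // i, j, and k all range over 0..N, the exact length of
                // every array involved.
                val += m1[i][k] * m2[k][j];
            }
            m3[i][j] = val;
        }
    }
}

With the Box-of-arrays representation above, the call site becomes mmult_generic::<SIZE>(&m1, &m2, &mut mm).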

I think that the Rust compiler could be improved to similarly optimize the vector version of the code.