matrix
Full source code
C++
// -*- mode: c++ -*-
// $Id$
// http://www.bagley.org/~doug/shootout/
#include <iostream>
#include <stdlib.h>
using namespace std;
#define SIZE 30
int **mkmatrix(int rows, int cols) {
int i, j, count = 1;
int **m = (int **) malloc(rows * sizeof(int *));
for (i=0; i<rows; i++) {
m[i] = (int *) malloc(cols * sizeof(int));
for (j=0; j<cols; j++) {
m[i][j] = count++;
}
}
return(m);
}
void zeromatrix(int rows, int cols, int **m) {
int i, j;
for (i=0; i<rows; i++)
for (j=0; j<cols; j++)
m[i][j] = 0;
}
void freematrix(int rows, int **m) {
while (--rows > -1) { free(m[rows]); }
free(m);
}
int **mmult(int rows, int cols, int **m1, int **m2, int **m3) {
int i, j, k, val;
for (i=0; i<rows; i++) {
for (j=0; j<cols; j++) {
val = 0;
for (k=0; k<cols; k++) {
val += m1[i][k] * m2[k][j];
}
m3[i][j] = val;
}
}
return(m3);
}
int main(int argc, char *argv[]) {
#ifdef SMALL_PROBLEM_SIZE
#define LENGTH 10000
#else
#define LENGTH 100000
#endif
int i, n = ((argc == 2) ? atoi(argv[1]) : LENGTH);
int **m1 = mkmatrix(SIZE, SIZE);
int **m2 = mkmatrix(SIZE, SIZE);
int **mm = mkmatrix(SIZE, SIZE);
for (i=0; i<n; i++) {
mm = mmult(SIZE, SIZE, m1, m2, mm);
}
cout << mm[0][0] << " " << mm[2][3] << " " << mm[3][2] << " " << mm[4][4] << endl;
freematrix(SIZE, m1);
freematrix(SIZE, m2);
freematrix(SIZE, mm);
return(0);
}
Rust
// Adapted from https://github.com/llvm/llvm-test-suite and // http://www.bagley.org/~doug/shootout/ use std::env; const SIZE: usize = 30; #[cfg(feature = "small_problem_size")] const LENGTH: i32 = 10000; #[cfg(not(feature = "small_problem_size"))] const LENGTH: i32 = 100000; fn mkmatrix(rows: usize, cols: usize) -> Vec::<Vec::<i32>> { let mut count = 1; let mut m = Vec::<Vec::<i32>>::new(); m.resize(rows, Default::default()); for i in 0..rows { m[i].resize(cols, Default::default()); for j in 0..cols { m[i][j] = count; count += 1; } } return m; } fn mmult(rows: usize, cols: usize, m1: &Vec::<Vec::<i32>>, m2: &Vec::<Vec::<i32>>, m3: &mut Vec::<Vec::<i32>>) { for i in 0..rows { for j in 0..cols { let mut val = 0; for k in 0..cols { val += m1[i][k] * m2[k][j]; } m3[i][j] = val; } } } fn main() { let mut args = env::args(); let n = if args.len() == 2 { args.nth(1).unwrap().parse::<i32>().unwrap() } else { LENGTH }; let m1 = mkmatrix(SIZE, SIZE); let m2 = mkmatrix(SIZE, SIZE); let mut mm = mkmatrix(SIZE, SIZE); for _i in 0..n { mmult(SIZE, SIZE, &m1, &m2, &mut mm); } println!("{} {} {} {}", mm[0][0], mm[2][3], mm[3][2], mm[4][4]); }
Porting notes
Using nested Vec
s to represent a matrix
C++
int **mkmatrix(int rows, int cols) {
int i, j, count = 1;
int **m = (int **) malloc(rows * sizeof(int *));
for (i=0; i<rows; i++) {
m[i] = (int *) malloc(cols * sizeof(int));
for (j=0; j<cols; j++) {
m[i][j] = count++;
}
}
return(m);
}
Rust
#![allow(unused)] fn main() { fn mkmatrix(rows: usize, cols: usize) -> Vec::<Vec::<i32>> { let mut count = 1; let mut m = Vec::<Vec::<i32>>::new(); m.resize(rows, Default::default()); for i in 0..rows { m[i].resize(cols, Default::default()); for j in 0..cols { m[i][j] = count; count += 1; } } return m; } }
We use a two-level, nested Vec
data structure where the first level is for the rows and the second level is for the columns. That is similar to the data structure that uses arrays in the C++ version. While it'd be possible to use raw i32 arrays in Rust, it is more idiomatic to use Vec
treating Vec
as the safe equivalent of raw arrays.
Passing matrices to the matrix multiplication function using references
C++
int **m1 = mkmatrix(SIZE, SIZE);
int **m2 = mkmatrix(SIZE, SIZE);
int **mm = mkmatrix(SIZE, SIZE);
for (i=0; i<n; i++) {
mm = mmult(SIZE, SIZE, m1, m2, mm);
}
Rust
#![allow(unused)] fn main() { let m1 = mkmatrix(SIZE, SIZE); let m2 = mkmatrix(SIZE, SIZE); let mut mm = mkmatrix(SIZE, SIZE); for _i in 0..n { mmult(SIZE, SIZE, &m1, &m2, &mut mm); } }
Freeing matrices
C++
freematrix(SIZE, m1);
freematrix(SIZE, m2);
freematrix(SIZE, mm);
Rust
#![allow(unused)] fn main() { // empty }
The Rust version doesn't need to explicitly free the matrices because they are automatically dropped at the end of the main function thanks to the ownership mechanism.
Performance analysis
The benchmark results show that the Rust version is more than 2 times slower than the C++ version.
We compare the generated code between C++ and Rust.
C++
Here is the hot loop according to the Linux Perf:
C++ perfdata
5.98 │2b0: mov -0x10(%r14,%r8,8),%r10
5.93 │ mov -0x8(%r14,%r8,8),%r11
7.04 │ mov (%r10,%rdi,4),%r10d
11.66 │ imul -0x8(%rsi,%r8,4),%r10d
5.84 │ add %r9d,%r10d
5.71 │ mov (%r11,%rdi,4),%r11d
10.56 │ imul -0x4(%rsi,%r8,4),%r11d
11.00 │ mov (%r14,%r8,8),%r9
5.17 │ mov (%r9,%rdi,4),%r9d
4.77 │ imul (%rsi,%r8,4),%r9d
4.52 │ add %r11d,%r9d
4.53 │ add %r10d,%r9d
8.94 │ add $0x3,%r8
4.73 │ cmp $0x20,%r8
│ ↑ jne 2b0
C++ LLVM IR
106: ; preds = %106, %104
%107 = phi i64 [ 0, %104 ], [ %135, %106 ]
%108 = phi i32 [ 0, %104 ], [ %134, %106 ]
%109 = getelementptr inbounds nuw i32, ptr %103, i64 %107
%110 = load i32, ptr %109, align 4, !tbaa !10
%111 = getelementptr inbounds nuw ptr, ptr %39, i64 %107
%112 = load ptr, ptr %111, align 8, !tbaa !5
%113 = getelementptr inbounds nuw i32, ptr %112, i64 %105
%114 = load i32, ptr %113, align 4, !tbaa !10
%115 = mul nsw i32 %114, %110
%116 = add nsw i32 %115, %108
%117 = add nuw nsw i64 %107, 1
%118 = getelementptr inbounds nuw i32, ptr %103, i64 %117
%119 = load i32, ptr %118, align 4, !tbaa !10
%120 = getelementptr inbounds nuw ptr, ptr %39, i64 %117
%121 = load ptr, ptr %120, align 8, !tbaa !5
%122 = getelementptr inbounds nuw i32, ptr %121, i64 %105
%123 = load i32, ptr %122, align 4, !tbaa !10
%124 = mul nsw i32 %123, %119
%125 = add nsw i32 %124, %116
%126 = add nuw nsw i64 %107, 2
%127 = getelementptr inbounds nuw i32, ptr %103, i64 %126
%128 = load i32, ptr %127, align 4, !tbaa !10
%129 = getelementptr inbounds nuw ptr, ptr %39, i64 %126
%130 = load ptr, ptr %129, align 8, !tbaa !5
%131 = getelementptr inbounds nuw i32, ptr %130, i64 %105
%132 = load i32, ptr %131, align 4, !tbaa !10
%133 = mul nsw i32 %132, %128
%134 = add nsw i32 %133, %125
%135 = add nuw nsw i64 %107, 3
%136 = icmp eq i64 %135, 30
br i1 %136, label %137, label %106, !llvm.loop !22
The loop is unrolled by a factor of 3.
Rust
Here is the hot loop according to the Linux Perf:
Rust perfdata
16.54 │ 260: cmp %rbx,%rsi
│ ↓ je 5c3
5.16 │ cmp %rbx,%r8
│ ↓ je 5d2
4.99 │ mov (%r12),%r13
4.84 │ cmp %r13,%rdi
│ ↓ jae 5e1
10.44 │ mov 0x8(%rax),%r13
5.67 │ mov 0x0(%r13,%rbx,4),%r13d
5.73 │ mov -0x8(%r12),%r14
5.16 │ imul (%r14,%rdi,4),%r13d
5.51 │ inc %rbx
10.59 │ add %r13d,%r9d
5.17 │ add $0x18,%r12
17.06 │ cmp $0x1e,%rbx
│ ↑ jne 260
Rust LLVM IR
bb10.us.i: ; preds = %bb17.us.i, %bb7.us.i
%iter2.sroa.0.024.us.i = phi i64 [ 0, %bb7.us.i ], [ %_0.i13.us.i, %bb17.us.i ]
%val.sroa.0.023.us.i = phi i32 [ 0, %bb7.us.i ], [ %95, %bb17.us.i ]
%_0.i13.us.i = add nuw nsw i64 %iter2.sroa.0.024.us.i, 1
%exitcond.not.i = icmp eq i64 %iter2.sroa.0.024.us.i, %_47.us.i
br i1 %exitcond.not.i, label %panic7.i.invoke, label %bb15.us.i
bb15.us.i: ; preds = %bb10.us.i
%_48.us.i = load ptr, ptr %90, align 8, !nonnull !4, !noundef !4
%_15.us.i = getelementptr inbounds i32, ptr %_48.us.i, i64 %iter2.sroa.0.024.us.i
%_14.us.i = load i32, ptr %_15.us.i, align 4, !noundef !4
%exitcond89.not.i = icmp eq i64 %iter2.sroa.0.024.us.i, %m2.val25
br i1 %exitcond89.not.i, label %panic7.i.invoke, label %bb16.us.i
bb16.us.i: ; preds = %bb15.us.i
call void @llvm.assume(i1 %35)
%_19.us.i = getelementptr inbounds %"alloc::vec::Vec<i32>", ptr %m2.val, i64 %iter2.sroa.0.024.us.i
%93 = getelementptr inbounds i8, ptr %_19.us.i, i64 16
%_57.us.i = load i64, ptr %93, align 8, !noundef !4
%_60.us.i = icmp ult i64 %iter1.sroa.0.025.us.i, %_57.us.i
br i1 %_60.us.i, label %bb17.us.i, label %panic7.i.invoke
bb17.us.i: ; preds = %bb16.us.i
%94 = getelementptr inbounds i8, ptr %_19.us.i, i64 8
%_58.us.i = load ptr, ptr %94, align 8, !nonnull !4, !noundef !4
%_18.us.i = getelementptr inbounds i32, ptr %_58.us.i, i64 %iter1.sroa.0.025.us.i
%_17.us.i = load i32, ptr %_18.us.i, align 4, !noundef !4
%_13.us.i = mul i32 %_17.us.i, %_14.us.i
%95 = add i32 %_13.us.i, %val.sroa.0.023.us.i
%exitcond90.not.i = icmp eq i64 %_0.i13.us.i, 30
br i1 %exitcond90.not.i, label %bb12.us.i, label %bb10.us.i
The key differences from the C++ version are the following.
First, the mmult
function isn't inlined into the main function. That makes it non-trivial to constant-propgate the Vec
sizes as compile-time constant 30. If intraprocedural constant propagation was implemented indepedent of inlining, it would do but it doesn't seem to be there.
Second, the bounds checks are not elided. The bounds checks could be hoisted out of the loop (loop invariant code motion) and the special-cased, bounds-check-free version of the innermost loop could be generated (loop versioning). The LLVM backend probably has some form of these implemented but it doesn't seem to trigger here.
Third, loop unrolling has not been applied. I suspect that because of the above two lack of optimizations, the loop doesn't look simple enough for loop unrolling to kick in.
Note these are differences in the frontends. The Clang frontend applied these three optimiations for the C++ version. But the Rustc frontend did not for the Rust version. While the LLVM backend could also apply these in theory but it doesn't seem to at least in the given environment.
A June 2025 update on the performance difference
I filed an issue at
the Rust repo about this performance difference. According to the
conversation, if the program uses arrays (of compile-time constant
size), as opposed to vectors to represent the matrices, where the
arrays are heap-allocated by using Box
's to be equivalent to the C++
version, as below, similar set of optimizations are performanced and
the performance gets on par with the C++ version.
A version that uses heap allocated arrays instead of vectors:
// Adapted from https://github.com/llvm/llvm-test-suite and // http://www.bagley.org/~doug/shootout/ use std::env; use std::convert::TryInto; const SIZE: usize = 30; #[cfg(feature = "small_problem_size")] const LENGTH: i32 = 10000; #[cfg(not(feature = "small_problem_size"))] const LENGTH: i32 = 100000; fn mkmatrix(rows: usize, cols: usize) -> Box::<[Box::<[i32; SIZE]>; SIZE]> { let mut count = 1; let mut row_vec = Vec::<Box<[i32; SIZE]>>::new(); for _i in 0..rows { let mut row: [i32; SIZE] = [0; SIZE]; for j in 0..cols { row[j] = count; count += 1; } row_vec.push(Box::new(row)); } let m: [Box<[i32; SIZE]>; SIZE] = row_vec.try_into().unwrap_or_else( |v: Vec<Box<[i32; SIZE]>>| panic!("Expected a Vec of length {} but it was {}", SIZE, v.len())); return Box::new(m); } fn mmult(rows: usize, cols: usize, m1: &Box::<[Box::<[i32; SIZE]>; SIZE]>, m2: &Box::<[Box::<[i32; SIZE]>; SIZE]>, m3: &mut Box::<[Box::<[i32; SIZE]>; SIZE]>) { for i in 0..rows { for j in 0..cols { let mut val = 0; for k in 0..cols { val += m1[i][k] * m2[k][j]; } m3[i][j] = val; } } } fn main() { let mut args = env::args(); let n = if args.len() == 2 { args.nth(1).unwrap().parse::<i32>().unwrap() } else { LENGTH }; let m1 = mkmatrix(SIZE, SIZE); let m2 = mkmatrix(SIZE, SIZE); let mut mm = mkmatrix(SIZE, SIZE); for _i in 0..n { mmult(SIZE, SIZE, &m1, &m2, &mut mm); } println!("{} {} {} {}", mm[0][0], mm[2][3], mm[3][2], mm[4][4]); }
But this may not be considered equivalent to the C++ version in that
the array version of mmult
in Rust is tied to the specific size
while the Rust vector version or the C++ version work on any sizes.
I think that the Rust compiler could be improved to similarly optimize the vector version of the code.