# compute C = A * B, using naive matrix-multiplication algorithm, # with a pre-allocated output array C. ("!" is a Julia convention # for functions that modify their arguments.) function matmul!(C, A, B) m,n = size(A) n,p = size(B) size(C) == (m,p) || error("incorrect dimensions ", size(C), " ≠ $m × $p") for i = 1:m for k = 1:p c = zero(eltype(C)) for j = 1:n @inbounds c += A[i,j] * B[j,k] end @inbounds C[i,k] = c end end return C end # a wrapper that allocates C of an appropriate type matmul(A, B) = matmul!(Array(promote_type(eltype(A), eltype(B)), size(A,1), size(B,2)), A, B) # correctness check: A = rand(5,6) B = rand(6,7) norm(matmul(A,B) - A * B) # for benchmarking, use only single-threaded BLAS: blas_set_num_threads(1) N = round(Int, logspace(1, 3, 50)) # 50 sizes from 10 to 1000 # alternatively, use N = 10:1000 to see some interesting patterns due to cache associativity etc. t = Float64[] t0 = Float64[] for n in N A = zeros(n,n) B = zeros(n,n) # preallocate output C so that allocation is not included in timing C = zeros(n,n) push!(t, @elapsed matmul!(C,A,B)) push!(t0, @elapsed A_mul_B!(C,A,B)) println("finished n = $n: slowdown of ", t[end]/t0[end]) end using PyPlot PyPlot.svg(true) # SVG output for nice plots plot(N, 2N.^3 ./ t * 1e-9, "rs-") plot(N, 2N.^3 ./ t0 * 1e-9 / 4, "b.-") ylabel(L"gigaflops $2n^3/t$") xlabel(L"matrix size $n$") legend(["naive matmul", "BLAS matmul / 4"], loc="lower left") # C implementation: Cmatmul = """ void Cmatmul(int m, int n, int p, double *C, double *A, double *B) { int i, j, k; for (i = 0; i < m; ++i) for (j = 0; j < p; ++j) { double c = 0.0; for (k = 0; k < n; ++k) c += A[i + m*k] * B[k + n*j]; C[i + m*j] = c; } } """ # compile to a shared library by piping Cmatmul to gcc: # (only works if you have gcc installed) const Clib = tempname() open(`gcc -fPIC -O3 -xc -shared -o $(Clib * "." * Base.Sys.dlext) -`, "w") do f print(f, Cmatmul) end # define a Julia cmatmul! function that simply calls the C code in the shared library we compiled function cmatmul!(C, A, B) m,n = size(A) n,p = size(B) size(C) == (m,p) || error("incorrect dimensions ", size(C), " ≠ $m × $p") ccall(("Cmatmul", Clib), Void, (Cint, Cint, Cint, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}), m, n, p, C, A, B) return C end cmatmul(A, B) = cmatmul!(Array(promote_type(eltype(A), eltype(B)), size(A,1), size(B,2)), A, B) # correctness check: A = rand(5,6) B = rand(6,7) norm(cmatmul(A,B) - A * B) tc = Float64[] for n in N A = zeros(n,n) B = zeros(n,n) # preallocate output C so that allocation is not included in timing C = zeros(n,n) push!(tc, @elapsed cmatmul!(C,A,B)) println("finished n = $n: speedup of ", tc[end]/t[length(tc)]) end function add_matmul_rec!(m,n,p, i0,j0,k0, C,A,B) if m+n+p <= 64 # base case: naive matmult for sufficiently large matrices for i = 1:m for k = 1:p c = zero(eltype(C)) for j = 1:n @inbounds c += A[i0+i,j0+j] * B[j0+j,k0+k] end @inbounds C[i0+i,k0+k] += c end end else m2 = m ÷ 2; n2 = n ÷ 2; p2 = p ÷ 2 add_matmul_rec!(m2, n2, p2, i0, j0, k0, C, A, B) add_matmul_rec!(m-m2, n2, p2, i0+m2, j0, k0, C, A, B) add_matmul_rec!(m2, n-n2, p2, i0, j0+n2, k0, C, A, B) add_matmul_rec!(m2, n2, p-p2, i0, j0, k0+p2, C, A, B) add_matmul_rec!(m-m2, n-n2, p2, i0+m2, j0+n2, k0, C, A, B) add_matmul_rec!(m2, n-n2, p-p2, i0, j0+n2, k0+p2, C, A, B) add_matmul_rec!(m-m2, n2, p-p2, i0+m2, j0, k0+p2, C, A, B) add_matmul_rec!(m-m2, n-n2, p-p2, i0+m2, j0+n2, k0+p2, C, A, B) end return C end function matmul_rec!(C, A, B) m,n = size(A) n,p = size(B) size(C) == (m,p) || error("incorrect dimensions ", size(C), " ≠ $m × $p") fill!(C, 0) return add_matmul_rec!(m,n,p, 0,0,0, C,A,B) end matmul_rec(A, B) = matmul_rec!(Array(promote_type(eltype(A), eltype(B)), size(A,1), size(B,2)), A, B) # correctness check: A = rand(50,60) B = rand(60,70) norm(matmul_rec(A,B) - A * B) tco = Float64[] for n in N A = zeros(n,n) B = zeros(n,n) # preallocate output C so that allocation is not included in timing C = zeros(n,n) push!(tco, @elapsed matmul_rec!(C,A,B)) println("finished n = $n: slowdown of ", tco[end]/t0[length(tc)]) end plot(N, 2N.^3 ./ t * 1e-9, "rs-") plot(N, 2N.^3 ./ tco * 1e-9, "ko-") plot(N, 2N.^3 ./ t0 * 1e-9 / 4, "b.-") ylabel(L"gigaflops $2n^3/t$") xlabel(L"matrix size $n$") legend(["naive matmul", "cache-oblivious", "BLAS matmul / 4"], loc="lower left")