# compute C = A * B, using naive matrix-multiplication algorithm,
# with a pre-allocated output array C.  ("!" is a Julia convention
# for functions that modify their arguments.)
function matmul!(C, A, B)
    m,n = size(A)
    n,p = size(B)
    size(C) == (m,p) || error("incorrect dimensions ", size(C), " ≠ $m × $p")
    for i = 1:m
        for k = 1:p
            c = zero(eltype(C))
            for j = 1:n
                @inbounds c += A[i,j] * B[j,k]
            end
            @inbounds C[i,k] = c
        end
    end
    return C
end

# a wrapper that allocates C of an appropriate type
matmul(A, B) = matmul!(Array(promote_type(eltype(A), eltype(B)),
                             size(A,1), size(B,2)),
                       A, B)

# correctness check:
A = rand(5,6)
B = rand(6,7)
norm(matmul(A,B) - A * B)

# for benchmarking, use only single-threaded BLAS:
blas_set_num_threads(1)

N = round(Int, logspace(1, 3, 50))  # 50 sizes from 10 to 1000
# alternatively, use N = 10:1000 to see some interesting patterns due to cache associativity etc.
t = Float64[]
t0 = Float64[]
for n in N
    A = zeros(n,n)
    B = zeros(n,n)
    # preallocate output C so that allocation is not included in timing
    C = zeros(n,n)
    push!(t, @elapsed matmul!(C,A,B))
    push!(t0, @elapsed A_mul_B!(C,A,B))
    println("finished n = $n: slowdown of ", t[end]/t0[end])
end

using PyPlot
PyPlot.svg(true) # SVG output for nice plots

plot(N, 2N.^3 ./ t * 1e-9, "rs-")
plot(N, 2N.^3 ./ t0 * 1e-9 / 4, "b.-")
ylabel(L"gigaflops $2n^3/t$")
xlabel(L"matrix size $n$")
legend(["naive matmul", "BLAS matmul / 4"], loc="lower left")

# C implementation:
Cmatmul = """
void Cmatmul(int m, int n, int p, double *C, double *A, double *B)
{
    int i, j, k;
    for (i = 0; i < m; ++i)
        for (j = 0; j < p; ++j) {
            double c = 0.0;
            for (k = 0; k < n; ++k)
                c += A[i + m*k] * B[k + n*j];
            C[i + m*j] = c;
    }
}
"""
# compile to a shared library by piping Cmatmul to gcc:
# (only works if you have gcc installed)
const Clib = tempname()
open(`gcc -fPIC -O3 -xc -shared -o $(Clib * "." * Base.Sys.dlext) -`, "w") do f
    print(f, Cmatmul)
end

# define a Julia cmatmul! function that simply calls the C code in the shared library we compiled
function cmatmul!(C, A, B)
    m,n = size(A)
    n,p = size(B)
    size(C) == (m,p) || error("incorrect dimensions ", size(C), " ≠ $m × $p")
    ccall(("Cmatmul", Clib), Void, (Cint, Cint, Cint, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}),
          m, n, p, C, A, B)
    return C
end
cmatmul(A, B) = cmatmul!(Array(promote_type(eltype(A), eltype(B)),
                               size(A,1), size(B,2)),
                         A, B)

# correctness check:
A = rand(5,6)
B = rand(6,7)
norm(cmatmul(A,B) - A * B)

tc = Float64[]
for n in N
    A = zeros(n,n)
    B = zeros(n,n)
    # preallocate output C so that allocation is not included in timing
    C = zeros(n,n)
    push!(tc, @elapsed cmatmul!(C,A,B))
    println("finished n = $n: speedup of ", tc[end]/t[length(tc)])
end

function add_matmul_rec!(m,n,p, i0,j0,k0, C,A,B)
    if m+n+p <= 64   # base case: naive matmult for sufficiently large matrices
        for i = 1:m
            for k = 1:p
                c = zero(eltype(C))
                for j = 1:n
                    @inbounds c += A[i0+i,j0+j] * B[j0+j,k0+k]
                end
                @inbounds C[i0+i,k0+k] += c
            end
        end
    else
        m2 = m ÷ 2; n2 = n ÷ 2; p2 = p ÷ 2
        add_matmul_rec!(m2, n2, p2, i0, j0, k0, C, A, B)
        
        add_matmul_rec!(m-m2, n2, p2, i0+m2, j0, k0, C, A, B)
        add_matmul_rec!(m2, n-n2, p2, i0, j0+n2, k0, C, A, B)
        add_matmul_rec!(m2, n2, p-p2, i0, j0, k0+p2, C, A, B)
        
        add_matmul_rec!(m-m2, n-n2, p2, i0+m2, j0+n2, k0, C, A, B)
        add_matmul_rec!(m2, n-n2, p-p2, i0, j0+n2, k0+p2, C, A, B)
        add_matmul_rec!(m-m2, n2, p-p2, i0+m2, j0, k0+p2, C, A, B)
        
        add_matmul_rec!(m-m2, n-n2, p-p2, i0+m2, j0+n2, k0+p2, C, A, B)
    end
    return C
end

function matmul_rec!(C, A, B)
    m,n = size(A)
    n,p = size(B)
    size(C) == (m,p) || error("incorrect dimensions ", size(C), " ≠ $m × $p")
    fill!(C, 0)
    return add_matmul_rec!(m,n,p, 0,0,0, C,A,B)
end


matmul_rec(A, B) = matmul_rec!(Array(promote_type(eltype(A), eltype(B)),
                                     size(A,1), size(B,2)),
                               A, B)

# correctness check:
A = rand(50,60)
B = rand(60,70)
norm(matmul_rec(A,B) - A * B)

tco = Float64[]
for n in N
    A = zeros(n,n)
    B = zeros(n,n)
    # preallocate output C so that allocation is not included in timing
    C = zeros(n,n)
    push!(tco, @elapsed matmul_rec!(C,A,B))
    println("finished n = $n: slowdown of ", tco[end]/t0[length(tc)])
end

plot(N, 2N.^3 ./ t * 1e-9, "rs-")
plot(N, 2N.^3 ./ tco * 1e-9, "ko-")
plot(N, 2N.^3 ./ t0 * 1e-9 / 4, "b.-")
ylabel(L"gigaflops $2n^3/t$")
xlabel(L"matrix size $n$")
legend(["naive matmul", "cache-oblivious", "BLAS matmul / 4"], loc="lower left")