%%file hello_world_mpi.c
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[]) {
    int rank_id, ierr, num_ranks;
    double start_time, wall_time;

    printf("C MPI minimal working example\n");

    ierr = MPI_Init(&argc, &argv);
    start_time = MPI_Wtime();
    ierr = MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank_id);

    if (rank_id == 0) {
        printf("Number of available processors = %d.\n", num_ranks);
    }
    printf("\tProcess number %d branching off.\n", rank_id);

    wall_time = MPI_Wtime() - start_time;
    if (rank_id == 0) {
        printf("Elapsed wallclock time = %8.6fs.\n", wall_time);
    }

    MPI_Finalize();
    return 0;
}
!mpicc -o hello_world_mpi hello_world_mpi.c
!./hello_world_mpi
!mpiexec ./hello_world_mpi
!mpiexec -np 1 ./hello_world_mpi
%%file darts_pi.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>

#define N 100000
#define R 1

double uniform_rng() { return (double)rand() / (double)RAND_MAX; } // Not a great RNG, but that is not the point of this exercise.

int main(int argc, char *argv[]) {
    int rank_id, ierr, num_ranks;
    MPI_Status status;

    ierr = MPI_Init(&argc, &argv);
    ierr = MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank_id);
    srand(time(NULL) + rank_id);

    // Count how many darts land inside the circle.
    int n_circle = 0;
    double x, y;
    for (int t = 0; t < N; t++) {
        x = uniform_rng();
        y = uniform_rng();
        if (x*x + y*y < (double)R*R) n_circle++;
    }

    // If this is the first process, gather everyone's data. Otherwise, send it to the first process.
    int total_circle[num_ranks]; // C99 variable-length array; compile with `-std=c99` or else use `malloc`.
    if (rank_id == 0) {
        total_circle[0] = n_circle;
        for (int i = 1; i < num_ranks; i++) {
            ierr = MPI_Recv(&total_circle[i], 1, MPI_INT, i, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
            //printf("\t%d: recv %d from %d\n", rank_id, total_circle[i], i);
        }
    } else {
        ierr = MPI_Send(&n_circle, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
        //printf("\t%d: send %d to %d\n", rank_id, n_circle, 0);
    }

    // Now sum over the data and calculate the approximation of pi.
    int total = 0;
    if (rank_id == 0) {
        for (int i = 0; i < num_ranks; i++) {
            total += total_circle[i];
        }
        printf("With %d trials, the resulting approximation to pi = %f.\n",
               num_ranks*N, 4.0*(double)total/((double)N*(double)num_ranks));
    }

    MPI_Finalize();
    return 0;
}
# To run this code from within the notebook.
!mpicc -std=c99 -o darts_pi darts_pi.c
!mpiexec -n 4 ./darts_pi
%%file darts_pi.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>

#define N 100000
#define R 1

double uniform_rng() { return (double)rand() / (double)RAND_MAX; } // Not a great RNG, but that is not the point of this exercise.

int main(int argc, char *argv[]) {
    int rank_id, ierr, num_ranks;

    ierr = MPI_Init(&argc, &argv);
    ierr = MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank_id);
    srand(time(NULL) + rank_id);

    // Count how many darts land inside the circle.
    int n_circle = 0;
    double x, y;
    for (int t = 0; t < N; t++) {
        x = uniform_rng();
        y = uniform_rng();
        if (x*x + y*y < (double)R*R) n_circle++;
    }

    // Reduce by summation over all ranks' counts and output the result.
    int total;
    ierr = MPI_Reduce(&n_circle, &total, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    if (rank_id == 0) {
        printf("With %d trials, the resulting approximation to pi = %f.\n",
               num_ranks*N, 4.0*(double)total/((double)N*(double)num_ranks));
    }

    MPI_Finalize();
    return 0;
}
# To run this code from within the notebook.
!mpicc -std=c99 -o darts_pi darts_pi.c
!mpiexec -n 4 ./darts_pi
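# Aside (not part of the original notebook): MPI_Allreduce works like MPI_Reduce, but it delivers the
# summed total to every rank, which is handy when all processes need the result. The cell below is a
# minimal, self-contained sketch of that variant; the file name darts_pi_allreduce.c and the inline
# RNG are illustrative only.
%%file darts_pi_allreduce.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>

#define N 100000

int main(int argc, char *argv[]) {
    int rank_id, num_ranks;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank_id);
    srand(time(NULL) + rank_id);

    // Same dart-throwing loop as above.
    int n_circle = 0;
    for (int t = 0; t < N; t++) {
        double x = (double)rand() / RAND_MAX, y = (double)rand() / RAND_MAX;
        if (x*x + y*y < 1.0) n_circle++;
    }

    // Like MPI_Reduce with MPI_SUM, but every rank receives the total.
    int total;
    MPI_Allreduce(&n_circle, &total, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    printf("Rank %d sees total = %d, pi ~= %f.\n",
           rank_id, total, 4.0*(double)total/((double)N*(double)num_ranks));

    MPI_Finalize();
    return 0;
}
!mpicc -std=c99 -o darts_pi_allreduce darts_pi_allreduce.c
!mpiexec -n 4 ./darts_pi_allreduce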
%%file fd_charge.c
// This is a serial version of the code for reference.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h> // for memcpy()
#include <time.h>

#define BOUNDS 1.0

double uniform_rng() { return (double)rand() / (double)RAND_MAX; } // Not a great RNG, but that is not the point of this exercise.

struct point_t {
    int x, y;
    double mag;
};

int main(int argc, char *argv[]) {
    int N = 10;
    if (argc > 1) N = atoi(argv[1]);
    int size = N % 2 == 0 ? N : N+1; // ensure the grid dimension is even
    int iter = 10;
    if (argc > 2) iter = atoi(argv[2]);
    int out_step = 1000;
    if (argc > 3) out_step = atoi(argv[3]);
    double eps = 8.8541878176e-12; // permittivity of the material
    srand(time(NULL));

    // Initialize the grid with an initial guess of zeros, as well as the coordinates.
    double phi[size][size]; // C99 variable-length arrays; compile with `-std=c99` or else use `malloc`.
    double oldphi[size][size];
    double x[size], y[size];
    double dx = (2*BOUNDS)/size, dy = dx;
    for (int i = 0; i < size; i++) {
        x[i] = (double)i*dx - BOUNDS;
        for (int j = 0; j < size; j++) {
            if (i == 0) y[j] = (double)j*dy - BOUNDS;
            phi[i][j] = 0.0;
        }
    }

    // Set up a few random points and magnitudes for the electrostatics problem.
    int K = 20;
    struct point_t pt_srcs[K];
    for (int k = 0; k < K; k++) {
        pt_srcs[k].x = (int)(uniform_rng() * N);
        pt_srcs[k].y = (int)(uniform_rng() * N);
        pt_srcs[k].mag = uniform_rng() * 2.0 - 1.0;
        printf("(%f, %f) @ %f\n", x[pt_srcs[k].x], y[pt_srcs[k].y], pt_srcs[k].mag);
    }

    // Iterate forward.
    int n_steps = 0;            // total number of steps iterated
    double inveps = 1.0 / eps;  // saves a division every iteration over the square loop
    double pt_src = 0.0;        // accumulator for whether a point source is located at a specific (i,j) index site
    while (n_steps < iter) {
        memcpy(oldphi, phi, sizeof(double)*size*size); // back up the old grid for residual comparison, if desired
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++) {
                // Calculate point source contributions.
                pt_src = 0;
                for (int k = 0; k < K; k++) {
                    pt_src = pt_src + ((pt_srcs[k].x == i && pt_srcs[k].y == j) ? pt_srcs[k].mag : 0.0);
                }
                phi[i][j] = 0.25*dx*dx * pt_src * inveps
                          + 0.25*(i == 0      ? 0.0 : phi[i-1][j])
                          + 0.25*(i == size-1 ? 0.0 : phi[i+1][j])
                          + 0.25*(j == 0      ? 0.0 : phi[i][j-1])
                          + 0.25*(j == size-1 ? 0.0 : phi[i][j+1]);
            }
        }
        if (n_steps % out_step == 0) {
            printf("Iteration #%d:\n", n_steps);
            printf("\tphi(%f, %f) = %24.20f\n",
                   x[(int)(0.5*N-1)], y[(int)(0.25*N+1)],
                   phi[(int)(0.5*N-1)][(int)(0.25*N+1)]);
        }
        n_steps++;
    }

    // Write the final condition out to disk and terminate.
    printf("Terminated after %d steps.\n", n_steps);
    FILE* f;
    f = fopen("./data.txt", "w"); // "w" = plain-text output
    if (f != NULL) {
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++) {
                fprintf(f, "%f\t", phi[i][j]);
            }
            fprintf(f, "\n");
        }
        fclose(f);
    } else {
        // failed to create the file
    }

    return 0;
}
!mpicc -std=c99 -o fd_charge fd_charge.c
!./fd_charge 200 80000 1000
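# Side note (not part of the original exercise): the double phi[size][size] declarations above are C99
# variable-length arrays on the stack, so very large grids risk a stack overflow. The comment in the code
# already suggests malloc as the alternative; the cell below is a minimal sketch of that idea (the file
# name vla_vs_malloc.c is made up here), using a pointer to a VLA row type so the phi[i][j] indexing is unchanged.
%%file vla_vs_malloc.c
// Illustrative only: a heap-allocated replacement for the stack VLAs used in fd_charge.c.
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[]) {
    int size = argc > 1 ? atoi(argv[1]) : 200;

    // A pointer to a C99 VLA row type keeps two-index access but places the data on the heap.
    double (*phi)[size] = malloc(sizeof(double[size][size]));
    if (phi == NULL) { fprintf(stderr, "allocation failed\n"); return 1; }

    for (int i = 0; i < size; i++)
        for (int j = 0; j < size; j++)
            phi[i][j] = 0.0;

    printf("Allocated a %d x %d grid (%zu bytes) on the heap.\n",
           size, size, sizeof(double[size][size]));

    free(phi);
    return 0;
}
!mpicc -std=c99 -o vla_vs_malloc vla_vs_malloc.c
!./vla_vs_malloc 400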
# Use Python to visualize the result quickly.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
#mpl.rcParams['figure.figsize'] = [20,20]

data = np.loadtxt('./data.txt')
grdt = np.gradient(data)
mx = 10**np.floor(np.log10(data.max()))

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,20))
axes[0].imshow(data, cmap=cm.seismic, vmin=-mx, vmax=mx, extent=[-1,1,-1,1])
axes[1].imshow(data, cmap=cm.seismic, vmin=-mx, vmax=mx)
#axes[1].quiver(grdt[0].T, grdt[1].T, scale=5e8)  # only good for N < 50
axes[1].contour(data, cmap=cm.bwr, vmin=-mx, vmax=mx, levels=np.arange(-mx, mx+1, (2*mx)/1e2))
fig.show()
%%file fd_charge.c
// This is a parallel version of the code.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h> // for memcpy()
#include <time.h>
#include <mpi.h>

#define BOUNDS 1.0

double uniform_rng() { return (double)rand() / (double)RAND_MAX; } // Not a great RNG, but that is not the point of this exercise.

struct point_t {
    int x, y;
    double mag;
};

int main(int argc, char *argv[]) {
    int size = 10;
    if (argc > 1) size = atoi(argv[1]);
    int iter = 10;
    if (argc > 2) iter = atoi(argv[2]);
    int out_step = 1000;
    if (argc > 3) out_step = atoi(argv[3]);
    double eps = 8.8541878176e-12; // permittivity of the material

    int rank_id, ierr, num_ranks;
    MPI_Request requests[8]; // at most one send and one receive per neighbor direction
    ierr = MPI_Init(&argc, &argv);
    ierr = MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank_id);
    srand(time(NULL) + rank_id);

    // We will restrict partitions of the domain (and thus processes) to square numbers for the sake of pedagogy.
    // Not fast, not elegant, but we only do it once.
    if (fabs(sqrt((double)num_ranks) - (int)sqrt((double)num_ranks)) > 1e-6) {
        printf("Expected a square number of processes; got %d.\n", num_ranks);
        MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER);
        return 0;
    }

    // Calculate the processor grid preparatory to the domain partition. We'll make the assignment into a
    // sqrt(num_ranks) x sqrt(num_ranks) array. This allows easy determination of neighboring processes.
    int base = (int)round(sqrt((double)num_ranks));
    int nbr_l = rank_id % base - 1 >= 0    ? rank_id - 1    : -1;
    //printf("%d: nbr_l = %d\n", rank_id, nbr_l);
    int nbr_r = rank_id % base + 1 < base  ? rank_id + 1    : -1;
    //printf("%d: nbr_r = %d\n", rank_id, nbr_r);
    int nbr_t = rank_id + base < num_ranks ? rank_id + base : -1;
    //printf("%d: nbr_t = %d\n", rank_id, nbr_t);
    int nbr_b = rank_id - base >= 0        ? rank_id - base : -1;
    //printf("%d: nbr_b = %d\n", rank_id, nbr_b);
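    // (Added illustration, not in the original code: with num_ranks = 9, base = 3 and the ranks
    //  form the grid
    //      row 2:  6  7  8
    //      row 1:  3  4  5
    //      row 0:  0  1  2
    //  so rank 4 gets nbr_l = 3, nbr_r = 5, nbr_t = 7, nbr_b = 1, while any neighbor that would
    //  fall outside the grid is marked with -1.)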
    // Initialize the _local_ grid with an initial guess of zeros, as well as the coordinates. We will keep each
    // local domain the same size as the original domain, 2x2 in coordinates, and just translate them to make a
    // larger domain---in this case, adding processes increases the domain size rather than the mesh resolution.
    double phi[size][size]; // C99 variable-length arrays; compile with `-std=c99` or else use `malloc`.
    double oldphi[size][size];
    double x[size], y[size];
    double dx = (2*BOUNDS)/size, dy = dx;
    double base_x = 2*BOUNDS*(rank_id % base) - BOUNDS*base,
           base_y = 2*BOUNDS*(rank_id - (rank_id % base)) / base - BOUNDS*base;
    for (int i = 0; i < size; i++) {
        x[i] = (double)i*dx + base_x;
        for (int j = 0; j < size; j++) {
            if (i == 0) y[j] = (double)j*dy + base_y;
            phi[i][j] = 0.0;
        }
    }

    double phi_l_in[size],  // ghost cell messaging arrays
           phi_r_in[size],
           phi_t_in[size],
           phi_b_in[size],
           phi_l_out[size],
           phi_r_out[size],
           phi_t_out[size],
           phi_b_out[size];
    for (int i = 0; i < size; i++) {
        phi_l_in[i] = 0.0;
        phi_r_in[i] = 0.0;
        phi_t_in[i] = 0.0;
        phi_b_in[i] = 0.0;
    }
    //printf("%d: (%f,%f)-(%f,%f)\n", rank_id, base_x, base_y, x[size-1], y[size-1]);

    // Set up a few random points and magnitudes for the electrostatics problem.
    int K = 80;
    struct point_t pt_srcs[K];
    for (int k = 0; k < K; k++) {
        (uniform_rng()); // proof that the RNG is bad: without this discard, the first number is the same on every rank.
        int x_rn = (int)(uniform_rng() * size);
        int y_rn = (int)(uniform_rng() * size);
        pt_srcs[k].x = x_rn;
        pt_srcs[k].y = y_rn;
        pt_srcs[k].mag = uniform_rng() * 2.0 - 1.0;
        //printf("%d: [%d, %d] (%f, %f) @ %f\n", rank_id, x_rn, y_rn, x[pt_srcs[k].x], y[pt_srcs[k].y], pt_srcs[k].mag);
    }

    // Iterate forward.
    int n_steps = 0;            // total number of steps iterated
    double inveps = 1.0 / eps;  // saves a division every iteration over the square loop
    double pt_src = 0.0;        // accumulator for whether a point source is located at a specific (i,j) index site
    while (n_steps < iter) {
        memcpy(oldphi, phi, sizeof(double)*size*size); // Back up the old matrix for residual comparison, if desired.

        // Propagate the matrix equation forward towards a solution.
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++) {
                // Calculate point source contributions.
                pt_src = 0;
                for (int k = 0; k < K; k++) {
                    pt_src = pt_src + ((pt_srcs[k].x == i && pt_srcs[k].y == j) ? pt_srcs[k].mag : 0.0);
                }
                phi[i][j] = 0.25*dx*dx * pt_src * inveps
                          + 0.25*(i == 0      ? (nbr_l < 0 ? 0.0 : phi_l_in[j]) : phi[i-1][j])
                          + 0.25*(i == size-1 ? (nbr_r < 0 ? 0.0 : phi_r_in[j]) : phi[i+1][j])
                          + 0.25*(j == 0      ? (nbr_b < 0 ? 0.0 : phi_b_in[i]) : phi[i][j-1])
                          + 0.25*(j == size-1 ? (nbr_t < 0 ? 0.0 : phi_t_in[i]) : phi[i][j+1]);
            }
        }

        // Communicate the border (ghost) cell information to neighboring processes with nonblocking
        // sends and receives, then wait for all of them to complete so the ghost data are valid
        // before the next iteration reads them.
        for (int i = 0; i < size; i++) {
            phi_l_out[i] = phi[0][i];
            phi_r_out[i] = phi[size-1][i];
            phi_t_out[i] = phi[i][size-1];
            phi_b_out[i] = phi[i][0];
        }
        int n_req = 0;
        // Pass data left.
        if (nbr_l >= 0) MPI_Isend(phi_l_out, size, MPI_DOUBLE, nbr_l, 0, MPI_COMM_WORLD, &requests[n_req++]);
        if (nbr_r >= 0) MPI_Irecv(phi_r_in, size, MPI_DOUBLE, nbr_r, MPI_ANY_TAG, MPI_COMM_WORLD, &requests[n_req++]);
        // Pass data right.
        if (nbr_r >= 0) MPI_Isend(phi_r_out, size, MPI_DOUBLE, nbr_r, 0, MPI_COMM_WORLD, &requests[n_req++]);
        if (nbr_l >= 0) MPI_Irecv(phi_l_in, size, MPI_DOUBLE, nbr_l, MPI_ANY_TAG, MPI_COMM_WORLD, &requests[n_req++]);
        // Pass data up.
        if (nbr_t >= 0) MPI_Isend(phi_t_out, size, MPI_DOUBLE, nbr_t, 0, MPI_COMM_WORLD, &requests[n_req++]);
        if (nbr_b >= 0) MPI_Irecv(phi_b_in, size, MPI_DOUBLE, nbr_b, MPI_ANY_TAG, MPI_COMM_WORLD, &requests[n_req++]);
        // Pass data down.
        if (nbr_b >= 0) MPI_Isend(phi_b_out, size, MPI_DOUBLE, nbr_b, 0, MPI_COMM_WORLD, &requests[n_req++]);
        if (nbr_t >= 0) MPI_Irecv(phi_t_in, size, MPI_DOUBLE, nbr_t, MPI_ANY_TAG, MPI_COMM_WORLD, &requests[n_req++]);
        MPI_Waitall(n_req, requests, MPI_STATUSES_IGNORE);
        MPI_Barrier(MPI_COMM_WORLD);
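        // (Aside, not part of the original code: the same halo exchange can be written with blocking
        //  MPI_Sendrecv calls, using MPI_PROC_NULL for absent neighbors so no special-casing is needed.
        //  Sketch for the left/right pair:
        //      MPI_Sendrecv(phi_l_out, size, MPI_DOUBLE, nbr_l >= 0 ? nbr_l : MPI_PROC_NULL, 0,
        //                   phi_r_in,  size, MPI_DOUBLE, nbr_r >= 0 ? nbr_r : MPI_PROC_NULL, 0,
        //                   MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        //  and analogously for the other three directions.)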
        // Output information periodically.
        if (rank_id == 0 && n_steps % out_step == 0) {
            printf("Iteration #%d:\n", n_steps);
            printf("\tphi(%6.3f, %6.3f) = %24.20f\n",
                   x[(int)(0.5*size-1)], y[(int)(0.25*size+1)],
                   phi[(int)(0.5*size-1)][(int)(0.25*size+1)]);
        }
        n_steps++;
    }

    if (rank_id == 0) {
        printf("Terminated after %d steps.\n", n_steps);
        printf("\tphi(%6.3f, %6.3f) = %24.20f\n",
               x[(int)(0.5*size-1)], y[(int)(0.25*size+1)],
               phi[(int)(0.5*size-1)][(int)(0.25*size+1)]);
    }

    // Write the final condition out to disk and terminate.
    // Parallel I/O, while supported by MPI, is a whole other ball game and we won't go into it here.
    // Thus we will gather all of the data to process rank 0 and output it from there.
    double phi_all[size*base*size*base];
    // Gathering each rank's phi directly would leave the output incorrectly ordered. We first transpose
    // each process's data into a column-major format so that everything can be aligned properly below.
    double phi_temp[size][size];
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            phi_temp[j][i] = phi[i][j];
        }
    }
    MPI_Gather(&phi_temp[0][0], size*size, MPI_DOUBLE, &phi_all[0], size*size, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    // At this point, the data are in the right array, but with an odd stride and relative order. We'll fix that now.
    double phi_new[size*base][size*base];
    int offset_x, offset_y;
    for (int p = 0; p < base*base; p++) {     // source process for the data
        offset_x = (p - p%base)/base;
        offset_y = (p%base);
        for (int i = 0; i < size; i++) {      // x-index of process data
            for (int j = 0; j < size; j++) {  // y-index of process data
                phi_new[offset_y*size+j][offset_x*size+i] = phi_all[p*size*size + i*size + j];
            }
        }
    }

    if (rank_id == 0) {
        FILE* f;
        f = fopen("data.txt", "w"); // "w" = plain-text output
        if (f != NULL) {
            for (int i = 0; i < base*size; i++) {
                for (int j = 0; j < base*size; j++) {
                    fprintf(f, "%f\t", phi_new[i][j]);
                }
                fprintf(f, "\n");
            }
            fclose(f);
        } else {
            // failed to create the file
        }
    }

    MPI_Finalize();
    return 0;
}
!mpicc -std=c99 -o fd_charge fd_charge.c
!mpiexec -np 9 ./fd_charge 100 100001 5000
# Use Python to visualize the result quickly.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline

data = np.loadtxt('./data.txt')
mx = 10**np.floor(np.log10(data.max()))

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(16,16))
axes.imshow(data, cmap=cm.seismic, vmin=-mx, vmax=mx)
fig.show()

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(16,16))
axes.contour(data[::-1,:], cmap=cm.bwr, vmin=-mx, vmax=mx, levels=np.arange(-mx, mx+1, (2*mx)/1e2))
fig.show()
%%file parprefix.c
// your code here

// THIS CODE OUTPUTS THE RESULTS SERIALLY TO DISK FROM EACH PROCESS.
// Write the final condition out to disk and terminate. Parallel I/O, while supported by MPI, is a
// whole other ball game that we won't go into here; instead, each process writes its own file.
double phi_all[size*base][size*base];
//MPI_Send(&phi[0][0], base*N*base*N, MPI_DOUBLE
FILE* f;
char filename[16];
sprintf(filename, "./data-%d.txt", rank_id);
f = fopen(filename, "w"); // "w" = plain-text output
if (f != NULL) {
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            fprintf(f, "%f\t", phi[i][j]);
        }
        fprintf(f, "\n");
    }
    fclose(f);
} else {
    // failed to create the file
}
# Use Python to visualize the result quickly.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
#mpl.rcParams['figure.figsize'] = [20,20]

N = 9
data = []
grdt = []
for i in range(0, N):
    data.append(np.loadtxt('./data-%d.txt' % i))
    grdt.append(np.gradient(data[-1]))

side = int(np.sqrt(N))
fig, axes = plt.subplots(nrows=side, ncols=side, figsize=(12,12))
for i in range(0, side):
    for j in range(0, side):
        axes[i][j].imshow(data[i*side+j].T, cmap=cm.seismic, vmin=-3.5e6, vmax=3.5e6, extent=[-1,1,-1,1])
        #axes[i][j].contour(data[i*side+j].T, cmap=cm.bwr, vmin=-1.5e6, vmax=1.5e6, levels=np.arange(-1e7, 1e7+1, 2e5))
fig.show()