%%file hello_world_mpi.c
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[]) {
    int rank_id, ierr, num_ranks;
    double start_time, wall_time;

    printf("C MPI minimal working example\n");

    ierr = MPI_Init(&argc, &argv);
    start_time = MPI_Wtime();
    ierr = MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank_id);

    if (rank_id == 0) {
        printf("Number of available processors = %d.\n", num_ranks);
    }
    printf("\tProcess number %d branching off.\n", rank_id);

    wall_time = MPI_Wtime() - start_time;
    if (rank_id == 0) {
        printf("Elapsed wallclock time = %8.6fs.\n", wall_time);
    }

    MPI_Finalize();
    return 0;
}

!mpicc -o hello_world_mpi hello_world_mpi.c
!./hello_world_mpi
!mpiexec ./hello_world_mpi
!mpiexec -np 8 ./hello_world_mpi

%%file darts_pi.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <mpi.h>

#define N 100000
#define R 1

double uniform_rng() { return (double)rand() / (double)RAND_MAX; } // Not great, but not the point of this exercise.

int main(int argc, char *argv[]) {
    int rank_id, ierr, num_ranks;
    MPI_Status status;

    ierr = MPI_Init(&argc, &argv);
    ierr = MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank_id);

    srand(time(NULL) + rank_id);

    // Calculate some number of darts thrown into the circle.
    int n_circle = 0;
    double x, y;
    for (int t = 0; t < N; t++) {
        x = uniform_rng();
        y = uniform_rng();
        if (x*x + y*y < (double)R*R) n_circle++;
    }

    // If this is the first process, then gather everyone's data. Otherwise, send it to the first process.
    int total_circle[num_ranks]; // C99 variable-length array; compile with `-std=c99` or else use `malloc`.
    if (rank_id == 0) {
        total_circle[0] = n_circle;
        for (int i = 1; i < num_ranks; i++) {
            ierr = MPI_Recv(&total_circle[i], 1, MPI_INT, i, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
            //printf("\t%d: recv %d from %d\n", rank_id, total_circle[i], i);
        }
    } else {
        ierr = MPI_Send(&n_circle, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
        //printf("\t%d: send %d to %d\n", rank_id, n_circle, 0);
    }

    // Now sum over the data and calculate the approximation of pi.
    int total = 0;
    if (rank_id == 0) {
        for (int i = 0; i < num_ranks; i++) {
            total += total_circle[i];
        }
        printf("With %d trials, the resulting approximation to pi = %f.\n",
               num_ranks*N, 4.0*(double)total/((double)N*(double)num_ranks));
    }

    MPI_Finalize();
    return 0;
}

# To run this code from within the notebook.
!mpicc -std=c99 -o darts_pi darts_pi.c
!mpiexec -n 4 ./darts_pi

%%file darts_pi.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <mpi.h>

#define N 100000
#define R 1

double uniform_rng() { return (double)rand() / (double)RAND_MAX; } // Not good, but not the point of this exercise.

int main(int argc, char *argv[]) {
    int rank_id, ierr, num_ranks;
    MPI_Status status;

    ierr = MPI_Init(&argc, &argv);
    ierr = MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank_id);

    srand(time(NULL) + rank_id);

    // Calculate some number of darts thrown into the circle.
    int n_circle = 0;
    double x, y;
    for (int t = 0; t < N; t++) {
        x = uniform_rng();
        y = uniform_rng();
        if (x*x + y*y < (double)R*R) n_circle++;
    }

    // Reduce by summation over all data and output the result.
    int total;
    ierr = MPI_Reduce(&n_circle, &total, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    if (rank_id == 0) {
        printf("With %d trials, the resulting approximation to pi = %f.\n",
               num_ranks*N, 4.0*(double)total/((double)N*(double)num_ranks));
    }

    MPI_Finalize();
    return 0;
}
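As an aside, the explicit receive loop in the first darts_pi version reimplements by hand what the collective MPI_Gather does in a single call. A minimal sketch of that substitution (hypothetical, reusing the variable names from the code above):

    // Possible replacement for the rank-0 receive loop and the MPI_Send branch:
    // every rank contributes its n_circle, and rank 0 receives one count per rank, in rank order.
    int total_circle[num_ranks];
    MPI_Gather(&n_circle, 1, MPI_INT,      // each rank sends one int
               total_circle, 1, MPI_INT,   // rank 0 receives num_ranks ints
               0, MPI_COMM_WORLD);         // root is rank 0

The second version below goes one step further and lets MPI_Reduce perform the summation itself, so rank 0 only ever sees the total.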
# To run this code from within the notebook.
!mpicc -std=c99 -o darts_pi darts_pi.c
!mpiexec -n 4 ./darts_pi

%%file fd_charge.c
// This is a serial version of the code for reference.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h> // for memcpy()
#include <math.h>

#define BOUNDS 1.0

double uniform_rng() { return (double)rand() / (double)RAND_MAX; } // Not good, but not the point of this exercise.

struct point_t {
    int x, y;
    double mag;
};

int main(int argc, char *argv[]) {
    int N = 10;
    if (argc > 1) N = atoi(argv[1]);
    int size = N % 2 == 0 ? N : N+1; // ensure an even grid size
    int iter = 10;
    if (argc > 2) iter = atoi(argv[2]);
    int out_step = 1000;
    if (argc > 3) out_step = atoi(argv[3]);
    double eps = 8.8541878176e-12; // permittivity of the material

    srand(time(NULL));

    // Initialize the grid with an initial guess of zeros as well as the coordinates.
    double phi[size][size]; // C99 variable-length arrays; compile with `-std=c99` or else use `malloc`.
    double oldphi[size][size];
    double x[size], y[size];
    double dx = (2*BOUNDS)/size, dy = dx;
    for (int i = 0; i < size; i++) {
        x[i] = (double)i*dx - BOUNDS;
        for (int j = 0; j < size; j++) {
            if (i == 0) y[j] = (double)j*dy - BOUNDS;
            phi[i][j] = 0.0;
        }
    }

    // Set up a few random points and magnitudes for the electrostatics problem.
    int K = 20;
    struct point_t pt_srcs[K];
    for (int k = 0; k < K; k++) {
        pt_srcs[k].x = (int)(uniform_rng() * N);
        pt_srcs[k].y = (int)(uniform_rng() * N);
        pt_srcs[k].mag = uniform_rng() * 2.0 - 1.0;
        printf("(%f, %f) @ %f\n", x[pt_srcs[k].x], y[pt_srcs[k].y], pt_srcs[k].mag);
    }

    // Iterate forward.
    int n_steps = 0;           // total number of steps iterated
    double inveps = 1.0 / eps; // saves a division every iteration over the square loop
    double pt_src = 0.0;       // accumulator for whether a point source is located at a specific (i,j) index site
    while (n_steps < iter) {
        memcpy(oldphi, phi, size*size*sizeof(double)); // note: the third argument is a byte count
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++) {
                // Calculate point source contributions.
                pt_src = 0;
                for (int k = 0; k < K; k++) {
                    pt_src = pt_src + ((pt_srcs[k].x == i && pt_srcs[k].y == j) ? pt_srcs[k].mag : 0.0);
                }

                phi[i][j] = 0.25*dx*dx * pt_src * inveps
                          + 0.25*(i == 0      ? 0.0 : phi[i-1][j])
                          + 0.25*(i == size-1 ? 0.0 : phi[i+1][j])
                          + 0.25*(j == 0      ? 0.0 : phi[i][j-1])
                          + 0.25*(j == size-1 ? 0.0 : phi[i][j+1]);
            }
        }

        if (n_steps % out_step == 0) {
            printf("Iteration #%d:\n", n_steps);
            printf("\tphi(%f, %f) = %24.20f\n",
                   x[(int)(0.5*N-1)], y[(int)(0.25*N+1)],
                   phi[(int)(0.5*N-1)][(int)(0.25*N+1)]);
        }
        n_steps++;
    }

    // Write the final condition out to disk and terminate.
    printf("Terminated after %d steps.\n", n_steps);
    FILE* f;
    f = fopen("./data.txt", "w"); // plain-text output
    if (f != NULL) {
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++) {
                fprintf(f, "%f\t", phi[i][j]);
            }
            fprintf(f, "\n");
        }
        fclose(f);
    } else {
        // failed to create the file
    }

    return 0;
}

!gcc -std=c99 -o fd_charge fd_charge.c
!./fd_charge 200 30000 1000

# Use Python to visualize the result quickly.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
#mpl.rcParams['figure.figsize']=[20,20]

data = np.loadtxt('./data.txt')
grdt = np.gradient(data)
mx = 10**np.floor(np.log10(data.max()))

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,20))
axes[0].imshow(data, cmap=cm.seismic, vmin=-mx, vmax=mx, extent=[-1,1,-1,1])
axes[1].imshow(data, cmap=cm.seismic, vmin=-mx, vmax=mx)
axes[1].contour(data, cmap=cm.bwr, vmin=-mx, vmax=mx, levels=np.arange(-mx,mx+1,(2*mx)/1e2))
fig.show()
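For reference, the update inside the relaxation loop of fd_charge.c above is a sweep of the discretized Poisson equation $\nabla^2 \phi = -\rho/\varepsilon$ on a uniform grid with $dx = dy$, treating each point-source magnitude as the local charge density $\rho_{i,j}$ (an identification the code itself does not state explicitly):

$$\phi_{i,j} \;\leftarrow\; \tfrac{1}{4}\left(\phi_{i-1,j} + \phi_{i+1,j} + \phi_{i,j-1} + \phi_{i,j+1}\right) \;+\; \frac{dx^2\,\rho_{i,j}}{4\,\varepsilon},$$

with $\phi = 0$ imposed just outside the domain boundary. The parallel version below keeps exactly this update and splits the domain between two processes stacked in a column.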
%%file fd_charge.c
// This is a parallel version of the code. The domain consists of a column of
// square domains abutting each other.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h> // for memcpy()
#include <math.h>
#include <mpi.h>

#define BOUNDS 1.0

double uniform_rng() { return (double)rand() / (double)RAND_MAX; } // Not good, but not the point of this exercise.

struct point_t {
    int x, y;
    double mag;
};

int main(int argc, char *argv[]) {
    // Get the number of discrete steps along the domain.
    int size = 50;
    if (argc > 1) size = atoi(argv[1]);
    // Get the number of iterations to use in the solver.
    int iter = 10000;
    if (argc > 2) iter = atoi(argv[2]);
    // Get the interval for output of the test value used to watch convergence.
    int out_step = iter/10;
    if (argc > 3) out_step = atoi(argv[3]);
    double eps = 8.8541878176e-12; // permittivity of the material

    int rank_id, ierr, num_ranks;
    MPI_Request reqs[4]; // request handles for the nonblocking ghost-row exchange
    int n_reqs;

    ierr = MPI_Init(&argc, &argv);
    ierr = MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank_id);

    srand(time(NULL) + rank_id);

    // We restrict the domain (and thus processes) to only two neighboring processes for the sake of pedagogy.
    if (num_ranks != 2) {
        printf("Expected 2 processes; actual number of processes %d.\n", num_ranks);
        MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER);
        return 0;
    }

    // Calculate the processor grid preparatory to the domain partition. We'll make the assignment into a
    // 2 x 1 array, making determination of neighboring processes trivial.
    int nbr_t, nbr_b;
    if (rank_id == 0) {
        nbr_t = -1;
        nbr_b = 1;
    } else {
        nbr_t = 0;
        nbr_b = -1;
    }

    // Initialize the _local_ grid with an initial guess of zeros as well as the coordinates. We keep each
    // local domain the same size as the original domain, 2x2 in coordinates, and just translate them to make a
    // larger domain---in this case, adding processes increases the domain size rather than the mesh resolution.
    double phi[size][size];      // potential grid---compile with `-std=c99` or else use `malloc`.
    double x[size],              // x-domain vector
           y[size];              // y-domain vector
    double dx = (2*BOUNDS)/size, // increment in x-direction
           dy = dx;              // increment in y-direction
    double base_x = -BOUNDS,     // [-1,1) for both processes
           base_y = 2*BOUNDS*(rank_id % 2) - BOUNDS*2; // [-2,0) for process 0; [0,2) for process 1
    for (int i = 0; i < size; i++) {
        x[i] = (double)i*dx + base_x;
        for (int j = 0; j < size; j++) {
            if (i == 0) y[j] = (double)j*dy + base_y;
            phi[i][j] = 0.0;
        }
    }

    double phi_t_in[size],  // ghost-cell messaging arrays
           phi_b_in[size],
           phi_t_out[size],
           phi_b_out[size];
    for (int i = 0; i < size; i++) {
        phi_t_in[i] = 0.0;
        phi_b_in[i] = 0.0;
    }

    // Set up a few random points and magnitudes for the electrostatics problem.
    int K = 10;
    struct point_t pt_srcs[K];
    for (int k = 0; k < K; k++) {
        (uniform_rng()); // demonstration that the RNG is bad: without this, the first number is the same on every process
        int x_rn = (int)(uniform_rng() * size);
        int y_rn = (int)(uniform_rng() * size);
        pt_srcs[k].x = x_rn;
        pt_srcs[k].y = y_rn;
        pt_srcs[k].mag = uniform_rng() * 2.0 - 1.0;
    }
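    // A note on the ghost ("halo") rows before the main loop: the five-point update below needs
    // phi at j-1 and j+1, so the row adjoining the neighboring subdomain must come from that
    // neighbor. The phi_*_out buffers hold this process's border rows and the phi_*_in buffers
    // receive the neighbor's; they are exchanged once per sweep with nonblocking MPI_Isend/MPI_Irecv
    // followed by MPI_Waitall. A blocking alternative (a sketch only, not used here) is
    // MPI_Sendrecv, with MPI_PROC_NULL substituted for the -1 sentinel so that a missing
    // neighbor becomes a no-op:
    //
    //   MPI_Sendrecv(phi_t_out, size, MPI_DOUBLE, /*dest*/ nbr_t, 0,
    //                phi_b_in,  size, MPI_DOUBLE, /*src*/  nbr_b, 0,
    //                MPI_COMM_WORLD, MPI_STATUS_IGNORE);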
    // Iterate forward.
    int n_steps = 0;           // total number of steps iterated
    double inveps = 1.0 / eps; // saves a division every iteration over the square loop
    double pt_src = 0.0;       // accumulator for whether a point source is located at a specific (i,j) index site
    while (n_steps < iter) {
        // Propagate the matrix equation forward towards a solution.
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++) {
                // Calculate point source contributions.
                pt_src = 0;
                for (int k = 0; k < K; k++) {
                    pt_src = pt_src + ((pt_srcs[k].x == i && pt_srcs[k].y == j) ? pt_srcs[k].mag : 0.0);
                }

                phi[i][j] = 0.25*dx*dx * pt_src * inveps
                          + 0.25*(i == 0      ? 0.0 : phi[i-1][j])
                          + 0.25*(i == size-1 ? 0.0 : phi[i+1][j])
                          + 0.25*(j == 0      ? (nbr_b < 0 ? 0.0 : phi_b_in[i]) : phi[i][j-1])
                          + 0.25*(j == size-1 ? (nbr_t < 0 ? 0.0 : phi_t_in[i]) : phi[i][j+1]);
            }
        }

        // Communicate the border-cell information to neighboring processes with nonblocking sends
        // and receives, then wait for all of the messages to complete before the buffers are reused.
        for (int i = 0; i < size; i++) {
            phi_t_out[i] = phi[i][size-1];
            phi_b_out[i] = phi[i][0];
        }
        n_reqs = 0;
        // Pass data up.
        if (nbr_t >= 0) MPI_Isend(phi_t_out, size, MPI_DOUBLE, nbr_t, 0, MPI_COMM_WORLD, &reqs[n_reqs++]);
        if (nbr_b >= 0) MPI_Irecv(phi_b_in,  size, MPI_DOUBLE, nbr_b, MPI_ANY_TAG, MPI_COMM_WORLD, &reqs[n_reqs++]);
        // Pass data down.
        if (nbr_b >= 0) MPI_Isend(phi_b_out, size, MPI_DOUBLE, nbr_b, 0, MPI_COMM_WORLD, &reqs[n_reqs++]);
        if (nbr_t >= 0) MPI_Irecv(phi_t_in,  size, MPI_DOUBLE, nbr_t, MPI_ANY_TAG, MPI_COMM_WORLD, &reqs[n_reqs++]);
        MPI_Waitall(n_reqs, reqs, MPI_STATUSES_IGNORE);
        MPI_Barrier(MPI_COMM_WORLD);

        // Output information periodically.
        if (rank_id == 0 && n_steps % out_step == 0) {
            printf("Iteration #%d:\n", n_steps);
            printf("\tphi(%6.3f, %6.3f) = %24.20f\n",
                   x[(int)(0.5*size-1)], y[(int)(0.25*size+1)],
                   phi[(int)(0.5*size-1)][(int)(0.25*size+1)]);
        }
        n_steps++;
    }

    if (rank_id == 0) {
        printf("Terminated after %d steps.\n", n_steps);
        printf("\tphi(%6.3f, %6.3f) = %24.20f\n",
               x[(int)(0.5*size-1)], y[(int)(0.25*size+1)],
               phi[(int)(0.5*size-1)][(int)(0.25*size+1)]);
    }

    // Write the final condition out to disk and terminate.
    // Parallel I/O, while supported by MPI, is a whole other ball game and we won't go into that here.
    // Thus we will gather all of the data to process rank 0 and output it from there.
    double phi_vector[size*2*size];
    // We actually have to transpose each process's data into a column-major format to align properly.
    double phi_trans[size][size];
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            phi_trans[size-j-1][i] = phi[i][j];
        }
    }
    MPI_Gather(&phi_trans[0][0], size*size, MPI_DOUBLE, &phi_vector[0], size*size, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    if (rank_id == 0) {
        // At this point, the data are in one array on process rank 0 but not in a two-dimensional array, so fix that.
        double phi_new[size*num_ranks][size];
        int offset;
        for (int p = 0; p < num_ranks; p++) {       // source process for data
            offset = p*size;
            for (int i = 0; i < size; i++) {        // x-index of process data
                for (int j = 0; j < size; j++) {    // y-index of process data
                    phi_new[offset+i][j] = phi_vector[p*size*size + i*size + j];
                }
            }
        }

        FILE* f;
        f = fopen("data.txt", "w"); // plain-text output
        if (f != NULL) {
            for (int i = 0; i < 2*size; i++) {
                for (int j = 0; j < size; j++) {
                    fprintf(f, "%f\t", phi_new[i][j]);
                }
                fprintf(f, "\n");
            }
            fclose(f);
        } else {
            // failed to create the file
        }
    }

    MPI_Finalize();
    return 0;
}

!mpicc -std=c99 -o fd_charge fd_charge.c
!mpiexec -np 2 ./fd_charge 100 10001 1000

# Use Python to visualize the result quickly.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline

data = np.loadtxt('./data.txt')
#mx = 10**np.floor(np.log10(data.max()))
mx = 0.5*data.max()

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8,8))
axes[0].imshow(data, cmap=cm.seismic, vmin=-mx, vmax=mx)
axes[1].contour(data[::-1,:], cmap=cm.bwr, vmin=-mx, vmax=mx, levels=np.arange(-8*mx,8*mx,mx/10))
fig.show()

%%file parprefix.c
// your code here