%%file hello_world_mpi.c
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[]) {
    int rank_id, ierr, num_ranks;
    double start_time, wall_time;

    printf("C MPI minimal working example\n");

    ierr = MPI_Init(&argc, &argv);
    start_time = MPI_Wtime();
    ierr = MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank_id);

    if (rank_id == 0) {
        printf("Number of available processors = %d.\n", num_ranks);
    }
    printf("\tProcess number %d branching off.\n", rank_id);

    wall_time = MPI_Wtime() - start_time;
    if (rank_id == 0) {
        printf("Elapsed wallclock time = %8.6fs.\n", wall_time);
    }

    MPI_Finalize();
    return 0;
}
!mpicc -o hello_world_mpi hello_world_mpi.c
!./hello_world_mpi
!mpiexec ./hello_world_mpi
!mpiexec -np 1 ./hello_world_mpi
%%file darts_pi.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>

#define N 100000
#define R 1

double uniform_rng() { return (double)rand() / (double)RAND_MAX; } // Not a great RNG, but that is not the point of this exercise.

int main(int argc, char *argv[]) {
    int rank_id, ierr, num_ranks;
    MPI_Status status;

    ierr = MPI_Init(&argc, &argv);
    ierr = MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank_id);
    srand(time(NULL) + rank_id);

    // Count how many darts land inside the circle.
    int n_circle = 0;
    double x, y;
    for (int t = 0; t < N; t++) {
        x = uniform_rng();
        y = uniform_rng();
        if (x*x + y*y < (double)R*R) n_circle++;
    }

    // If this is the first process, gather everyone's data. Otherwise, send it to the first process.
    int total_circle[num_ranks]; // C99 variable-length array; compile with `-std=c99` or else use `malloc`.
    if (rank_id == 0) {
        total_circle[0] = n_circle;
        for (int i = 1; i < num_ranks; i++) {
            ierr = MPI_Recv(&total_circle[i], 1, MPI_INT, i, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
            //printf("\t%d: recv %d from %d\n", rank_id, total_circle[i], i);
        }
    } else {
        ierr = MPI_Send(&n_circle, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
        //printf("\t%d: send %d to %d\n", rank_id, n_circle, 0);
    }

    // Now sum over the data and calculate the approximation of pi.
    int total = 0;
    if (rank_id == 0) {
        for (int i = 0; i < num_ranks; i++) {
            total += total_circle[i];
        }
        printf("With %d trials, the resulting approximation to pi = %f.\n",
               num_ranks*N, 4.0*(double)total/((double)N*(double)num_ranks));
    }

    MPI_Finalize();
    return 0;
}
# To run this code from within the notebook.
!mpicc -std=c99 -o darts_pi darts_pi.c
!mpiexec -n 4 ./darts_pi
%%file darts_pi.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>

#define N 100000
#define R 1

double uniform_rng() { return (double)rand() / (double)RAND_MAX; } // Not a great RNG, but that is not the point of this exercise.

int main(int argc, char *argv[]) {
    int rank_id, ierr, num_ranks;

    ierr = MPI_Init(&argc, &argv);
    ierr = MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank_id);
    srand(time(NULL) + rank_id);

    // Count how many darts land inside the circle.
    int n_circle = 0;
    double x, y;
    for (int t = 0; t < N; t++) {
        x = uniform_rng();
        y = uniform_rng();
        if (x*x + y*y < (double)R*R) n_circle++;
    }

    // Reduce by summation over all ranks' counts and output the result.
    int total;
    ierr = MPI_Reduce(&n_circle, &total, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
    if (rank_id == 0) {
        printf("With %d trials, the resulting approximation to pi = %f.\n",
               num_ranks*N, 4.0*(double)total/((double)N*(double)num_ranks));
    }

    MPI_Finalize();
    return 0;
}
# To run this code from within the notebook.
!mpicc -std=c99 -o darts_pi darts_pi.c
!mpiexec -n 4 ./darts_pi
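# Aside (not part of the original notebook): MPI_Allreduce works like MPI_Reduce, but it delivers the
# summed total to every rank, which is handy when all processes need the result. The cell below is a
# minimal, self-contained sketch of that variant; the file name darts_pi_allreduce.c and the inline
# RNG are illustrative only.
%%file darts_pi_allreduce.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>

#define N 100000

int main(int argc, char *argv[]) {
    int rank_id, num_ranks;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank_id);
    srand(time(NULL) + rank_id);

    // Same dart-throwing loop as above.
    int n_circle = 0;
    for (int t = 0; t < N; t++) {
        double x = (double)rand() / RAND_MAX, y = (double)rand() / RAND_MAX;
        if (x*x + y*y < 1.0) n_circle++;
    }

    // Like MPI_Reduce with MPI_SUM, but every rank receives the total.
    int total;
    MPI_Allreduce(&n_circle, &total, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    printf("Rank %d sees total = %d, pi ~= %f.\n",
           rank_id, total, 4.0*(double)total/((double)N*(double)num_ranks));

    MPI_Finalize();
    return 0;
}
!mpicc -std=c99 -o darts_pi_allreduce darts_pi_allreduce.c
!mpiexec -n 4 ./darts_pi_allreduce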
%%file fd_charge.c
// This is a serial version of the code for reference.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h> // for memcpy()
#include <time.h>

#define BOUNDS 1.0

double uniform_rng() { return (double)rand() / (double)RAND_MAX; } // Not a great RNG, but that is not the point of this exercise.

struct point_t {
    int x, y;
    double mag;
};

int main(int argc, char *argv[]) {
    int N = 10;
    if (argc > 1) N = atoi(argv[1]);
    int size = N % 2 == 0 ? N : N+1; // ensure the grid dimension is even
    int iter = 10;
    if (argc > 2) iter = atoi(argv[2]);
    int out_step = 1000;
    if (argc > 3) out_step = atoi(argv[3]);
    double eps = 8.8541878176e-12; // permittivity of the material
    srand(time(NULL));

    // Initialize the grid with an initial guess of zeros, as well as the coordinates.
    double phi[size][size]; // C99 variable-length arrays; compile with `-std=c99` or else use `malloc`.
    double oldphi[size][size];
    double x[size], y[size];
    double dx = (2*BOUNDS)/size, dy = dx;
    for (int i = 0; i < size; i++) {
        x[i] = (double)i*dx - BOUNDS;
        for (int j = 0; j < size; j++) {
            if (i == 0) y[j] = (double)j*dy - BOUNDS;
            phi[i][j] = 0.0;
        }
    }

    // Set up a few random points and magnitudes for the electrostatics problem.
    int K = 20;
    struct point_t pt_srcs[K];
    for (int k = 0; k < K; k++) {
        pt_srcs[k].x = (int)(uniform_rng() * N);
        pt_srcs[k].y = (int)(uniform_rng() * N);
        pt_srcs[k].mag = uniform_rng() * 2.0 - 1.0;
        printf("(%f, %f) @ %f\n", x[pt_srcs[k].x], y[pt_srcs[k].y], pt_srcs[k].mag);
    }

    // Iterate forward.
    int n_steps = 0;            // total number of steps iterated
    double inveps = 1.0 / eps;  // saves a division every iteration over the square loop
    double pt_src = 0.0;        // accumulator for whether a point source is located at a specific (i,j) index site
    while (n_steps < iter) {
        memcpy(oldphi, phi, sizeof(double)*size*size); // back up the old grid for residual comparison, if desired
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++) {
                // Calculate point source contributions.
                pt_src = 0;
                for (int k = 0; k < K; k++) {
                    pt_src = pt_src + ((pt_srcs[k].x == i && pt_srcs[k].y == j) ? pt_srcs[k].mag : 0.0);
                }
                phi[i][j] = 0.25*dx*dx * pt_src * inveps
                          + 0.25*(i == 0      ? 0.0 : phi[i-1][j])
                          + 0.25*(i == size-1 ? 0.0 : phi[i+1][j])
                          + 0.25*(j == 0      ? 0.0 : phi[i][j-1])
                          + 0.25*(j == size-1 ? 0.0 : phi[i][j+1]);
            }
        }
        if (n_steps % out_step == 0) {
            printf("Iteration #%d:\n", n_steps);
            printf("\tphi(%f, %f) = %24.20f\n",
                   x[(int)(0.5*N-1)], y[(int)(0.25*N+1)],
                   phi[(int)(0.5*N-1)][(int)(0.25*N+1)]);
        }
        n_steps++;
    }

    // Write the final condition out to disk and terminate.
    printf("Terminated after %d steps.\n", n_steps);
    FILE* f;
    f = fopen("./data.txt", "w"); // "w" = plain-text output
    if (f != NULL) {
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++) {
                fprintf(f, "%f\t", phi[i][j]);
            }
            fprintf(f, "\n");
        }
        fclose(f);
    } else {
        // failed to create the file
    }

    return 0;
}
!mpicc -std=c99 -o fd_charge fd_charge.c
!./fd_charge 200 80000 1000
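# Side note (not part of the original exercise): the double phi[size][size] declarations above are C99
# variable-length arrays on the stack, so very large grids risk a stack overflow. The comment in the code
# already suggests malloc as the alternative; the cell below is a minimal sketch of that idea (the file
# name vla_vs_malloc.c is made up here), using a pointer to a VLA row type so the phi[i][j] indexing is unchanged.
%%file vla_vs_malloc.c
// Illustrative only: a heap-allocated replacement for the stack VLAs used in fd_charge.c.
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[]) {
    int size = argc > 1 ? atoi(argv[1]) : 200;

    // A pointer to a C99 VLA row type keeps two-index access but places the data on the heap.
    double (*phi)[size] = malloc(sizeof(double[size][size]));
    if (phi == NULL) { fprintf(stderr, "allocation failed\n"); return 1; }

    for (int i = 0; i < size; i++)
        for (int j = 0; j < size; j++)
            phi[i][j] = 0.0;

    printf("Allocated a %d x %d grid (%zu bytes) on the heap.\n",
           size, size, sizeof(double[size][size]));

    free(phi);
    return 0;
}
!mpicc -std=c99 -o vla_vs_malloc vla_vs_malloc.c
!./vla_vs_malloc 400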
# Use Python to visualize the result quickly.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
#mpl.rcParams['figure.figsize'] = [20,20]

data = np.loadtxt('./data.txt')
grdt = np.gradient(data)
mx = 10**np.floor(np.log10(data.max()))

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,20))
axes[0].imshow(data, cmap=cm.seismic, vmin=-mx, vmax=mx, extent=[-1,1,-1,1])
axes[1].imshow(data, cmap=cm.seismic, vmin=-mx, vmax=mx)
#axes[1].quiver(grdt[0].T, grdt[1].T, scale=5e8)  # only good for N < 50
axes[1].contour(data, cmap=cm.bwr, vmin=-mx, vmax=mx, levels=np.arange(-mx, mx+1, (2*mx)/1e2))
fig.show()
%%file fd_charge.c
// This is a parallel version of the code.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h> // for memcpy()
#include <time.h>
#include <mpi.h>

#define BOUNDS 1.0

double uniform_rng() { return (double)rand() / (double)RAND_MAX; } // Not a great RNG, but that is not the point of this exercise.

struct point_t {
    int x, y;
    double mag;
};

int main(int argc, char *argv[]) {
    int size = 10;
    if (argc > 1) size = atoi(argv[1]);
    int iter = 10;
    if (argc > 2) iter = atoi(argv[2]);
    int out_step = 1000;
    if (argc > 3) out_step = atoi(argv[3]);
    double eps = 8.8541878176e-12; // permittivity of the material

    int rank_id, ierr, num_ranks;
    MPI_Request requests[8]; // at most one send and one receive per neighbor direction
    ierr = MPI_Init(&argc, &argv);
    ierr = MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank_id);
    srand(time(NULL) + rank_id);

    // We will restrict partitions of the domain (and thus processes) to square numbers for the sake of pedagogy.
    // Not fast, not elegant, but we only do it once.
    if (fabs(sqrt((double)num_ranks) - (int)sqrt((double)num_ranks)) > 1e-6) {
        printf("Expected a square number of processes; got %d.\n", num_ranks);
        MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER);
        return 0;
    }

    // Calculate the processor grid preparatory to the domain partition. We'll make the assignment into a
    // sqrt(num_ranks) x sqrt(num_ranks) array. This allows easy determination of neighboring processes.
    int base = (int)round(sqrt((double)num_ranks));
    int nbr_l = rank_id % base - 1 >= 0    ? rank_id - 1    : -1;
    //printf("%d: nbr_l = %d\n", rank_id, nbr_l);
    int nbr_r = rank_id % base + 1 < base  ? rank_id + 1    : -1;
    //printf("%d: nbr_r = %d\n", rank_id, nbr_r);
    int nbr_t = rank_id + base < num_ranks ? rank_id + base : -1;
    //printf("%d: nbr_t = %d\n", rank_id, nbr_t);
    int nbr_b = rank_id - base >= 0        ? rank_id - base : -1;
    //printf("%d: nbr_b = %d\n", rank_id, nbr_b);
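    // (Added illustration, not in the original code: with num_ranks = 9, base = 3 and the ranks
    //  form the grid
    //      row 2:  6  7  8
    //      row 1:  3  4  5
    //      row 0:  0  1  2
    //  so rank 4 gets nbr_l = 3, nbr_r = 5, nbr_t = 7, nbr_b = 1, while any neighbor that would
    //  fall outside the grid is marked with -1.)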
    // Initialize the _local_ grid with an initial guess of zeros, as well as the coordinates. We will keep each
    // local domain the same size as the original domain, 2x2 in coordinates, and just translate them to make a
    // larger domain---in this case, adding processes increases the domain size rather than the mesh resolution.
    double phi[size][size]; // C99 variable-length arrays; compile with `-std=c99` or else use `malloc`.
    double oldphi[size][size];
    double x[size], y[size];
    double dx = (2*BOUNDS)/size, dy = dx;
    double base_x = 2*BOUNDS*(rank_id % base) - BOUNDS*base,
           base_y = 2*BOUNDS*(rank_id - (rank_id % base)) / base - BOUNDS*base;
    for (int i = 0; i < size; i++) {
        x[i] = (double)i*dx + base_x;
        for (int j = 0; j < size; j++) {
            if (i == 0) y[j] = (double)j*dy + base_y;
            phi[i][j] = 0.0;
        }
    }

    double phi_l_in[size],  // ghost cell messaging arrays
           phi_r_in[size],
           phi_t_in[size],
           phi_b_in[size],
           phi_l_out[size],
           phi_r_out[size],
           phi_t_out[size],
           phi_b_out[size];
    for (int i = 0; i < size; i++) {
        phi_l_in[i] = 0.0;
        phi_r_in[i] = 0.0;
        phi_t_in[i] = 0.0;
        phi_b_in[i] = 0.0;
    }
    //printf("%d: (%f,%f)-(%f,%f)\n", rank_id, base_x, base_y, x[size-1], y[size-1]);

    // Set up a few random points and magnitudes for the electrostatics problem.
    int K = 80;
    struct point_t pt_srcs[K];
    for (int k = 0; k < K; k++) {
        (uniform_rng()); // proof that the RNG is bad: without this discard, the first number is the same on every rank.
        int x_rn = (int)(uniform_rng() * size);
        int y_rn = (int)(uniform_rng() * size);
        pt_srcs[k].x = x_rn;
        pt_srcs[k].y = y_rn;
        pt_srcs[k].mag = uniform_rng() * 2.0 - 1.0;
        //printf("%d: [%d, %d] (%f, %f) @ %f\n", rank_id, x_rn, y_rn, x[pt_srcs[k].x], y[pt_srcs[k].y], pt_srcs[k].mag);
    }

    // Iterate forward.
    int n_steps = 0;            // total number of steps iterated
    double inveps = 1.0 / eps;  // saves a division every iteration over the square loop
    double pt_src = 0.0;        // accumulator for whether a point source is located at a specific (i,j) index site
    while (n_steps < iter) {
        memcpy(oldphi, phi, sizeof(double)*size*size); // Back up the old matrix for residual comparison, if desired.

        // Propagate the matrix equation forward towards a solution.
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++) {
                // Calculate point source contributions.
                pt_src = 0;
                for (int k = 0; k < K; k++) {
                    pt_src = pt_src + ((pt_srcs[k].x == i && pt_srcs[k].y == j) ? pt_srcs[k].mag : 0.0);
                }
                phi[i][j] = 0.25*dx*dx * pt_src * inveps
                          + 0.25*(i == 0      ? (nbr_l < 0 ? 0.0 : phi_l_in[j]) : phi[i-1][j])
                          + 0.25*(i == size-1 ? (nbr_r < 0 ? 0.0 : phi_r_in[j]) : phi[i+1][j])
                          + 0.25*(j == 0      ? (nbr_b < 0 ? 0.0 : phi_b_in[i]) : phi[i][j-1])
                          + 0.25*(j == size-1 ? (nbr_t < 0 ? 0.0 : phi_t_in[i]) : phi[i][j+1]);
            }
        }

        // Communicate the border (ghost) cell information to neighboring processes with nonblocking
        // sends and receives, then wait for all of them to complete so the ghost data are valid
        // before the next iteration reads them.
        for (int i = 0; i < size; i++) {
            phi_l_out[i] = phi[0][i];
            phi_r_out[i] = phi[size-1][i];
            phi_t_out[i] = phi[i][size-1];
            phi_b_out[i] = phi[i][0];
        }
        int n_req = 0;
        // Pass data left.
        if (nbr_l >= 0) MPI_Isend(phi_l_out, size, MPI_DOUBLE, nbr_l, 0, MPI_COMM_WORLD, &requests[n_req++]);
        if (nbr_r >= 0) MPI_Irecv(phi_r_in, size, MPI_DOUBLE, nbr_r, MPI_ANY_TAG, MPI_COMM_WORLD, &requests[n_req++]);
        // Pass data right.
        if (nbr_r >= 0) MPI_Isend(phi_r_out, size, MPI_DOUBLE, nbr_r, 0, MPI_COMM_WORLD, &requests[n_req++]);
        if (nbr_l >= 0) MPI_Irecv(phi_l_in, size, MPI_DOUBLE, nbr_l, MPI_ANY_TAG, MPI_COMM_WORLD, &requests[n_req++]);
        // Pass data up.
        if (nbr_t >= 0) MPI_Isend(phi_t_out, size, MPI_DOUBLE, nbr_t, 0, MPI_COMM_WORLD, &requests[n_req++]);
        if (nbr_b >= 0) MPI_Irecv(phi_b_in, size, MPI_DOUBLE, nbr_b, MPI_ANY_TAG, MPI_COMM_WORLD, &requests[n_req++]);
        // Pass data down.
        if (nbr_b >= 0) MPI_Isend(phi_b_out, size, MPI_DOUBLE, nbr_b, 0, MPI_COMM_WORLD, &requests[n_req++]);
        if (nbr_t >= 0) MPI_Irecv(phi_t_in, size, MPI_DOUBLE, nbr_t, MPI_ANY_TAG, MPI_COMM_WORLD, &requests[n_req++]);
        MPI_Waitall(n_req, requests, MPI_STATUSES_IGNORE);
        MPI_Barrier(MPI_COMM_WORLD);
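        // (Aside, not part of the original code: the same halo exchange can be written with blocking
        //  MPI_Sendrecv calls, using MPI_PROC_NULL for absent neighbors so no special-casing is needed.
        //  Sketch for the left/right pair:
        //      MPI_Sendrecv(phi_l_out, size, MPI_DOUBLE, nbr_l >= 0 ? nbr_l : MPI_PROC_NULL, 0,
        //                   phi_r_in,  size, MPI_DOUBLE, nbr_r >= 0 ? nbr_r : MPI_PROC_NULL, 0,
        //                   MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        //  and analogously for the other three directions.)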
        // Output information periodically.
        if (rank_id == 0 && n_steps % out_step == 0) {
            printf("Iteration #%d:\n", n_steps);
            printf("\tphi(%6.3f, %6.3f) = %24.20f\n",
                   x[(int)(0.5*size-1)], y[(int)(0.25*size+1)],
                   phi[(int)(0.5*size-1)][(int)(0.25*size+1)]);
        }
        n_steps++;
    }

    if (rank_id == 0) {
        printf("Terminated after %d steps.\n", n_steps);
        printf("\tphi(%6.3f, %6.3f) = %24.20f\n",
               x[(int)(0.5*size-1)], y[(int)(0.25*size+1)],
               phi[(int)(0.5*size-1)][(int)(0.25*size+1)]);
    }

    // Write the final condition out to disk and terminate.
    // Parallel I/O, while supported by MPI, is a whole other ball game and we won't go into it here.
    // Thus we will gather all of the data to process rank 0 and output it from there.
    double phi_all[size*base*size*base];
    // Gathering each rank's phi directly would leave the output incorrectly ordered. We first transpose
    // each process's data into a column-major format so that everything can be aligned properly below.
    double phi_temp[size][size];
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            phi_temp[j][i] = phi[i][j];
        }
    }
    MPI_Gather(&phi_temp[0][0], size*size, MPI_DOUBLE, &phi_all[0], size*size, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    // At this point, the data are in the right array, but with an odd stride and relative order. We'll fix that now.
    double phi_new[size*base][size*base];
    int offset_x, offset_y;
    for (int p = 0; p < base*base; p++) {     // source process for the data
        offset_x = (p - p%base)/base;
        offset_y = (p%base);
        for (int i = 0; i < size; i++) {      // x-index of process data
            for (int j = 0; j < size; j++) {  // y-index of process data
                phi_new[offset_y*size+j][offset_x*size+i] = phi_all[p*size*size + i*size + j];
            }
        }
    }

    if (rank_id == 0) {
        FILE* f;
        f = fopen("data.txt", "w"); // "w" = plain-text output
        if (f != NULL) {
            for (int i = 0; i < base*size; i++) {
                for (int j = 0; j < base*size; j++) {
                    fprintf(f, "%f\t", phi_new[i][j]);
                }
                fprintf(f, "\n");
            }
            fclose(f);
        } else {
            // failed to create the file
        }
    }

    MPI_Finalize();
    return 0;
}
!mpicc -std=c99 -o fd_charge fd_charge.c
!mpiexec -np 9 ./fd_charge 100 100001 5000
# Use Python to visualize the result quickly.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline

data = np.loadtxt('./data.txt')
mx = 10**np.floor(np.log10(data.max()))

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(16,16))
axes.imshow(data, cmap=cm.seismic, vmin=-mx, vmax=mx)
fig.show()

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(16,16))
axes.contour(data[::-1,:], cmap=cm.bwr, vmin=-mx, vmax=mx, levels=np.arange(-mx, mx+1, (2*mx)/1e2))
fig.show()
%%file parprefix.c
// your code here

// THIS CODE OUTPUTS THE RESULTS SERIALLY TO DISK FROM EACH PROCESS.
// Write the final condition out to disk and terminate. Parallel I/O, while supported by MPI, is a
// whole other ball game that we won't go into here; instead, each process writes its own file.
double phi_all[size*base][size*base];
//MPI_Send(&phi[0][0], base*N*base*N, MPI_DOUBLE
FILE* f;
char filename[16];
sprintf(filename, "./data-%d.txt", rank_id);
f = fopen(filename, "w"); // "w" = plain-text output
if (f != NULL) {
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            fprintf(f, "%f\t", phi[i][j]);
        }
        fprintf(f, "\n");
    }
    fclose(f);
} else {
    // failed to create the file
}
# Use Python to visualize the result quickly.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
#mpl.rcParams['figure.figsize'] = [20,20]

N = 9
data = []
grdt = []
for i in range(0, N):
    data.append(np.loadtxt('./data-%d.txt' % i))
    grdt.append(np.gradient(data[-1]))

side = int(np.sqrt(N))
fig, axes = plt.subplots(nrows=side, ncols=side, figsize=(12,12))
for i in range(0, side):
    for j in range(0, side):
        axes[i][j].imshow(data[i*side+j].T, cmap=cm.seismic, vmin=-3.5e6, vmax=3.5e6, extent=[-1,1,-1,1])
        #axes[i][j].contour(data[i*side+j].T, cmap=cm.bwr, vmin=-1.5e6, vmax=1.5e6, levels=np.arange(-1e7, 1e7+1, 2e5))
fig.show()