%%file openmp.cpp #include #include using namespace std; int main(int argc, char *argv[]) { int th_id, nthreads; #pragma omp parallel private(th_id) shared(nthreads) { th_id = omp_get_thread_num(); cout << "Hello World from thread " << th_id << '\n'; #pragma omp barrier #pragma omp single cout << "---" << endl; #pragma omp critical cout << "Hello World from thread " << th_id << '\n'; #pragma omp barrier cout << "There are " << omp_get_max_threads() << " threads" << '\n'; } return 0; } !g++ -Wall -fopenmp -O2 -pedantic openmp.cpp -o openmp !./openmp # Timing code in IPython %timeit !./openmp !g++ -Wall -O2 -pedantic openmp.cpp -o openmp_nopragma %%file int_trap_serial.cpp #include #include using namespace std; #define PI 3.14159265 #define N 1000 double integrand(double x) { return cos(x)/(2+cos(x)); } int main(int argc, char** argv) { double result = 0, x; double a = PI, b = 5*PI/2; double dx = (b - a) / N; for (int j = 1; j < N; j++) { x = dx * j; result += integrand(x); } // Add endpoint calculations. result += 1/2 * integrand(a); result += 1/2 * integrand(b); // Multiply by prefactor result *= dx; cout << "The calculated value is " << result << ".\n"; return 0; } !g++ -Wall -fopenmp -O2 -pedantic int_trap_serial.cpp -o int_trap_serial !./int_trap_serial %%file int_trap_omp.cpp #include #include using namespace std; #include #define PI 3.14159265 #define N 1000 double integrand(double x) { return cos(x)/(2+cos(x)); } int main(int argc, char** argv) { double result = 0, x; double a = PI, b = 5*PI/2; double dx = (b - a) / N; int j; #pragma omp parallel for private(x) reduction(+:result) schedule(static,1) for (j = 1; j < N; j++) { x = dx * j; result += integrand(x); } cout << "Using " << omp_get_max_threads() << " threads, "; // Add endpoint calculations. result += 1/2 * integrand(a); result += 1/2 * integrand(b); // Multiply by prefactor result *= dx; cout << "the calculated value is " << result << ".\n"; return 0; } !g++ -Wall -fopenmp -O2 -pedantic int_trap_omp.cpp -o int_trap_omp !./int_trap_omp %%file int_simp_omp.cpp #include #include using namespace std; #include #define PI 3.14159265 #define N 1000 double integrand(double x) { double value; // TODO return value; } int main(int argc, char** argv) { double result = 0, x; double a = 0, b = 2; double dx = (b - a) / N; int j; #pragma omp parallel for private(x) reduction(+:result) schedule(static,1) for (j = 1; j < N; j++) { //TODO } cout << "Using " << omp_get_max_threads() << " threads, "; // Add endpoint calculations. // TODO // Multiply by prefactor // TODO cout << "the calculated value is " << result << ".\n"; return 0; } !/usr/local/bin/g++-4.8 -Wall -fopenmp -O2 -std=c++11 -pedantic int_simp_omp.cpp -o int_simp_omp !./int_simp_omp %%file fib_serial.cpp #include #include using namespace std; long fib(int n) { if (n < 2) return n; else { long x, y; x = fib(n-1); y = fib(n-2); return x + y; } } int main(int argc, char** argv) { int n = atoi(argv[1]); // Note no error checking here. double F = fib(n); cout << "The " << n << "th Fibonacci number is " << F << ".\n"; return 0; } !/usr/local/bin/g++-4.8 -Wall -fopenmp -O2 -std=c++11 -pedantic fib_serial.cpp -o fib_serial !./fib_serial 18 # Don't go higher than 40. %%file fib_omp.cpp #include #include using namespace std; #include #include long fib(int n) { if (n < 2) return n; else { long x, y; #pragma omp task shared(x) x=fib(n-1); #pragma omp task shared(y) y=fib(n-2); #pragma omp taskwait return x+y; } } int main(int argc, char** argv) { int n = atoi(argv[1]); // Note no error checking here. double F; #pragma omp parallel F = fib(n); cout << "The " << n << "th Fibonacci number is " << F << ".\n"; return 0; } !g++ -Wall -fopenmp -O2 -pedantic fib_omp.cpp -o fib_omp !./fib_omp 18 # Don't go higher than 40. !g++ -Wall -O2 -pedantic fib_omp.cpp -o fib_nomp !./fib_nomp 8 %timeit !./fib_serial 18 %timeit !./fib_omp 18 %%file vadd.cpp #include #include #include using namespace std; #include #define N 16 int main(int argc, char** argv) { vector a(N); vector b(N); vector c(N); // Initialize in parallel. # pragma omp parallel for for (int i = 0; i < a.size(); i++) { a[i] = i; b[i] = 2*i; } // Add in parallel. # pragma omp parallel for for (int i = 0; i < a.size(); i++) { c[i] = a[i] + b[i]; } // Output result of calculation. for (int i = 0; i < a.size(); i++) { cout << a[i] << " + " << b[i] << "\t= " << c[i] << endl; } } !g++ -Wall -fopenmp -O2 -pedantic vadd.cpp -o vadd !./vadd !g++ -Wall -fopenmp -O2 -pedantic vadd.cpp -o vadd !./vadd %%file vmult.c #include #include #include #define N 16 int main(int argc, char** argv) { double a[N][N]; double b[N][N]; double c[N][N]; // your code here } !g++ -Wall -fopenmp -O2 -pedantic vmult.c -o vmult !./vmult %%file vinit.cpp #include using namespace std; #include int main(int argc, char** argv) { vector a(0); // Initialize a vector in parallel by adding new elements to it. # pragma omp parallel for for (int i = 0; i < 12; i++) { a.push_back(i); } } !g++ -Wall -fopenmp -O2 -pedantic vinit.cpp -o vinit !./vinit %%file mmult-block.c // Modified from an original example by Blaise Barney for LLNL. // https://computing.llnl.gov/tutorials/openMP/samples/C/omp_mm.c #include #include #include #define NRA 62 /* number of rows in matrix A */ #define NCA 15 /* number of columns in matrix A */ #define NCB 7 /* number of columns in matrix B */ int main (int argc, char *argv[]) { int tid, nthreads, i, j, k, chunk; double a[NRA][NCA], /* matrix A to be multiplied */ b[NCA][NCB], /* matrix B to be multiplied */ c[NRA][NCB]; /* result matrix C */ chunk = 10; /* set loop iteration chunk size */ #pragma omp parallel shared(a, b, c, nthreads, chunk) private(tid, i, j, k) { tid = omp_get_thread_num(); if (tid == 0) { nthreads = omp_get_num_threads(); printf("Starting matrix multiple example with %d threads\n", nthreads); } /*** Initialize matrices ***/ #pragma omp for schedule (static, chunk) for (i = 0; i < NRA; i++) for (j = 0; j < NCA; j++) a[i][j] = i + j; #pragma omp for schedule (static, chunk) for (i = 0; i < NCA; i++) for (j = 0; j < NCB; j++) b[i][j] = i * j; #pragma omp for schedule (static, chunk) for (i = 0; i < NRA; i++) for (j = 0; j < NCB; j++) c[i][j] = 0; /*** Do matrix multiply sharing iterations on outer loop ***/ /*** Display who does which iterations for demonstration purposes ***/ printf("Thread %d starting matrix multiply...\n",tid); #pragma omp for schedule (static, chunk) for (i = 0; i < NRA; i++) { printf("Thread=%d did row=%d\n",tid,i); for (j = 0; j < NCB; j++) for (k = 0; k < NCA; k++) c[i][j] += a[i][k] * b[k][j]; } } /*** end parallel section ***/ printf("\nFirst Matrix:\n"); for (i=0; i #include #include #include using namespace std; #define PI 3.1415926535 // The k-th term of the infinite product series for sine. static complex term(int k, complex z) { return (complex(1.0, 0.0) - pow(z, 2)/(pow(k, 2) * PI*PI)); } int main(int argc, char** argv) { int N = 10; if (argc > 1) N = atoi(argv[1]); complex z = complex(PI*0.25, PI*0.25); if (argc > 2) z = complex(atof(argv[2]), atof(argv[3])); complex res, i_res; double res_r = 1.0, res_i = 1.0; int j; #pragma omp parallel for private(i_res, j) reduction(*:res_r,res_i) schedule(static,1) for (j = 1; j < N; j++) { i_res = term(j, z); res_r *= real(i_res); res_i *= imag(i_res); } // note that reduction over STL data types is not yet supported res = complex(res_r, res_i) * z; cout << "Using " << omp_get_max_threads() << " threads and " << N << " terms, "; cout << "the value of sin(" << z << ") is " << res << ".\n"; cout << "This compares to the library value of " << sin(z) << " with an error of " << (sin(z)-res) << ".\n"; return 0; } !g++ -Wall -fopenmp -O2 -pedantic series.cpp -o series !./series 16 0.0 0.31415 %%file sine.cpp !g++ -Wall -fopenmp -O2 -pedantic sine.cpp -o sine !./sine 1.0471975511966 #pi/3 %%file rng_bad.cpp #include #include #include #include #include #define N 100000 #define R 1 double uniform_rng() { return (double)rand() / (double)RAND_MAX; } int main(int argc, char *argv[]) { int tid; srand(time(NULL)); // Calculate and display a few "random" numbers. #pragma omp parallel private(tid) { tid = omp_get_thread_num(); printf("%d\t%f\n", tid, uniform_rng()); } return 0; } !g++ -Wall -fopenmp -O2 -pedantic rng_bad.cpp -o rng_bad !./rng_bad %%file rng_good.cpp #include #include #include #include #include #define N 100000 #define R 1 int main(int argc, char *argv[]) { int tid; unsigned short rand_seed[3]; srand48(time(NULL)); // Calculate and display a few random numbers. #pragma omp parallel private(tid, rand_seed) { tid = omp_get_thread_num(); rand_seed[0] = tid + 120 + time(NULL); rand_seed[1] = tid * 2 + time(NULL); rand_seed[2] = tid * 100 + time(NULL); printf("%d\t%f\n", tid, erand48(rand_seed)); } return 0; } !g++ -Wall -fopenmp -O2 -pedantic rng_good.cpp -o rng_good !./rng_good %%file rng_better.cpp #include #include #include #include #include #include #define N 100000 #define R 1 int main(int argc, char *argv[]) { int tid; std::random_device rd; // non-deterministic bit generator std::mt19937 gen; // Mersenne twister URNG std::uniform_real_distribution<> dist(0,1); // transforms results to a useful distribution and range // Calculate and display a few random numbers. #pragma omp parallel private(tid, rd, gen, dist) { tid = omp_get_thread_num(); gen.seed(time(NULL)+tid); printf("%d\t%f\n", tid, dist(gen)); } return 0; } !g++ -Wall -fopenmp -O2 -pedantic rng_better.cpp -o rng_better !./rng_better # plotting code if you'd like to see RNGs v. rank. from numpy import linspace, zeros, append stdout = !./rng_better x = [] y = [] for out in stdout: x.append(float(out.split('\t')[0])) y.append(float(out.split('\t')[1])) import matplotlib as mpl import matplotlib.pyplot as plt from matplotlib import cm %matplotlib inline fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16,4)) axes[0].plot(x, y, 'r.') axes[1].plot(zeros(len(x)),y,'r.') fig.show() %%file monte_carlo_omp.cpp #include #include #include #include #include #include #include #define NPTS 10 using namespace std; // Termination criteria double rtol = 1e-2; // Relative tolerance long max_trials = 1e6; // Maximum number of MC trials long num_batch = 500; // Number of batches per trial /** is_converged() * Use adaptive error estimation to terminate the simulation when the error bars * at one standard deviation are below rtol. */ bool is_converged(double sum_x, // Sum of x double sum_x2, // Sum of x, squared double num_trials) { // Number of trials double E_x = sum_x / num_trials; double E_x2 = sum_x2 / num_trials; double var_x = E_x2 - E_x * E_x; return (var_x/(E_x*E_x) < rtol*rtol || num_trials > max_trials); } int main(int argc, char** argv) { int num_threads = omp_get_max_threads(); double E_x, E_x2, sigma_x; // Monte Carlo result variables double all_sum_x = 0; double all_sum_x2 = 0; long all_ntrials = 0; // Private state variables int f_done; // Flag for completion int tid, // ID of this OpenMP thread tnbatch; // double sum_x, // sum_x2; // long seed; // Unique seed for Mersenne twister PRNG std::random_device rd; // non-deterministic bit generator std::mt19937 gen; // Mersenne twister URNG std::uniform_real_distribution<> dist(0,1); // transforms results to a useful distribution and range #pragma omp parallel shared(all_sum_x, all_sum_x2, all_ntrials, num_batch) private(gen, dist, seed, f_done, tid, sum_x, sum_x2) { tnbatch = num_batch; tid = omp_get_thread_num(); f_done = false; #pragma omp critical seed = static_cast(std::time(0)) + tid; gen.seed(seed); do { // Run batch of experiments. sum_x = 0; sum_x2 = 0; for (int t = 0; t < tnbatch; ++t) { double x = dist(gen); double xx[2][NPTS]; double d2 = -1; for (int i = 0; i < NPTS; i++) { double x_i = dist(gen); double y_i = dist(gen); xx[0][i] = x_i; xx[1][i] = y_i; // Assess distance between points. for (int j = 0; j < i; j++) { double d_x_j = xx[0][j] - x_i; double d_y_j = xx[1][j] - y_i; double d_ij2 = d_x_j*d_x_j + d_y_j*d_y_j; if (d2 < 0 || d_ij2 < d2) d2 = d_ij2; } } x = sqrt(d2); sum_x += x; sum_x2 += x*x; } // Update global counts and test for termination. #pragma omp critical { f_done = (f_done || is_converged(all_sum_x, all_sum_x2, all_ntrials)); all_sum_x += sum_x; all_sum_x2 += sum_x2; all_ntrials += tnbatch; f_done = (f_done || is_converged(all_sum_x, all_sum_x2, all_ntrials)); } } while (!f_done); } // Compute expected value and error bars at one standard deviation. E_x = all_sum_x / all_ntrials; E_x2 = all_sum_x2 / all_ntrials; sigma_x = sqrt((E_x2-E_x*E_x) / all_ntrials); // Output the value and error bar. printf("With %d threads, the result was $\\bar{x}$ = %f and $\\sigma_{x}$ = %f.\n", num_threads, E_x, sigma_x); return 0; } //rewrite this using TINA? using thread lock on master RNG? _both_ !g++ -Wall -fopenmp -O2 -pedantic monte_carlo_omp.cpp -o monte_carlo_omp !./monte_carlo_omp %%file reduce.cpp #include #include #include using namespace std; struct force_t { double x, y; }; int main(int argc, char** argv) { #pragma omp declare reduction (+ : force_t : omp_out.x += omp_in.x, omp_out.y += omp_in.y) force_t pt; int N = 10; if (argc > 1) N = atoi(argv[1]); #pragma omp parallel for reduction(+ : pt) schedule(static,1) for (int j = 0; j < N; j++) { force_t my_pt; my_pt.x = 1.0*j; my_pt.y = 2.0*j; pt.x += my_pt.x; pt.y += my_pt.y; } cout << "Using " << omp_get_max_threads() << " threads, the resultant force is (" << pt.x << "," << pt.y << ").\n"; } !g++-4.9 -Wall -fopenmp -O2 -std=c++11 -pedantic reduce.cpp -o reduce !./reduce 32 %%file reduce-complex.cpp !g++ -Wall -fopenmp -O2 -pedantic reduce-complex.cpp -o reduce-complex !./reduce-complex 32 %%file series-time.cpp #include #include #include #include using namespace std; #define PI 3.1415926535 // The k-th term of the infinite product series for sine. static complex term(int k, complex z) { return (complex(1.0, 0.0) - pow(z, 2)/(pow(k, 2) * PI*PI)); } int main(int argc, char** argv) { int N = 10; if (argc > 1) N = atoi(argv[1]); complex z = complex(PI*0.25, PI*0.25); if (argc > 2) z = complex(atof(argv[2]), atof(argv[3])); double ser_start, ser_end, par_start, par_end; complex res, i_res; double res_r = 1.0, res_i = 1.0; int j; // Solve the problem serially first. ser_start = omp_get_wtime(); for (j = 1; j < N; j++) { i_res = term(j, z); res_r *= real(i_res); res_i *= imag(i_res); } // note that reduction over STL data types is not yet supported res = complex(res_r, res_i) * z; ser_end = omp_get_wtime(); cout << "The serial code takes " << (ser_end - ser_start) << " s to get the answer " << res << ".\n"; // Solve the parallel problem. par_start = omp_get_wtime(); #pragma omp parallel for private(i_res, j) reduction(*:res_r,res_i) schedule(static,1) for (j = 1; j < N; j++) { i_res = term(j, z); res_r *= real(i_res); res_i *= imag(i_res); } // note that reduction over STL data types is not yet supported res = complex(res_r, res_i) * z; par_end = omp_get_wtime(); cout << "The parallel code takes " << (par_end - par_start) << " s to get the answer " << res << " with " << omp_get_max_threads() << " threads.\n"; } !g++ -Wall -fopenmp -O2 -pedantic series-time.cpp -o series-time !./series-time 1 !./series-time 10 !./series-time 100 !./series-time 1000 !./series-time 10000 !./series-time 100000 !./series-time 1000000 !./series-time 10000000 !./series-time 100000000 import numpy as np from numpy import pi, cos, linspace import matplotlib as mpl import matplotlib.pyplot as plt %matplotlib inline mpl.rcParams['figure.figsize']=[18,3] x = linspace(pi-0.05, 2.5*pi+0.02, 1001) f = cos(x)/(2+cos(x)) f[0] = f[-1] = 0 fig = plt.figure() ax = fig.add_subplot(111) ax.fill(x, f, color='cornflowerblue') ax.set_xlim(pi, 2.5*pi) ax.set_ylim(-1.0, 1.0) unit = 0.5 x_tick = np.arange(1, 2.5+unit, unit) x_label= [r"$\pi$", r"$\frac{3\pi}{2}$", r"$2\pi$", r"$\frac{5\pi}{2}$"] ax.set_xticks(x_tick*np.pi) ax.set_xticklabels(x_label, fontsize=16) plt.show() x = linspace(0, 2.02, 1001) f = np.sinh(x) / np.cosh(x) - 1.0 f[0] = f[-1] = 0 fig = plt.figure() ax = fig.add_subplot(111) ax.fill(x, f, color='maroon') ax.set_xlim(0.0, 2.0) ax.set_ylim(-1.2,0.2) plt.show() %%file reduce-complex.cpp #include #include #include #include int main(int argc, char** argv) { #pragma omp declare reduction (+ : std::complex : omp_out += omp_in) int N = 10; if (argc > 1) N = atoi(argv[1]); std::complex val; #pragma omp parallel for reduction(+ : val) schedule(dynamic,1) for (int j = 0; j < N; j++) { std::complex my_val = std::complex(2.0,2.0); val += my_val; } std::cout << "Using " << omp_get_max_threads() << " threads, the summed complex value is " << val.real() << "+" << val.imag() << "i.\n"; }