import datetime

import matplotlib.pyplot as plt
import numpy as np
from IPython.parallel import Client
from matplotlib import colors
Read the data, define the target string, and transform it into a numpy array
f = open('pi-billion.txt')
pi_string = f.read()
pi_string = pi_string.replace('.','')
soundcloud_string = '000003399960000003495999940001275959999710395959599999936959595999999618595959999981'
soundcloud_array = np.fromiter(soundcloud_string, int)
We'll attack the problem in chunks of a given length. The following function prepares the chunk partitions
def define_partitions(length, shift=0):
nclients = len(c.ids)
step = length / nclients
A = range(shift*length, (shift+1)*length, step)
partitions = [[A[i], A[i+1]+84] for i in range(len(A) - 1)]
partitions[-1][1] = length*(shift+1)
return partitions
Distance function between arrays: the sum of absolute differences between cells, computed with numpy arrays (subtract element-wise and sum the absolute values). The numpy implementation of Einstein summation (einsum) is more performant than the standard np.sum()
def compute_clouds((a,b)):
pi_array = np.fromiter(pi_string[a:b], int)
res = [(np.einsum('i->', np.abs(soundcloud_array-pi_array[ind:ind+84])), a+ind) for ind in range(len(pi_array)-84)]
return sorted(res)[:10]
Start the IPython parallel engines. Import the necessary modules and push the global variables. Running on a multicore machine with 8 nodes
c = Client()
%px import numpy as np
c[:]['soundcloud_array'] = soundcloud_array
c[:]['pi_string'] = pi_string
c.ids
[0, 1, 2, 3, 4, 5, 6, 7]
Example of partition
partitions = define_partitions(len(pi_string)/50, 3)
partitions
[[60000000, 62500084], [62500000, 65000084], [65000000, 67500084], [67500000, 70000084], [70000000, 72500084], [72500000, 75000084], [75000000, 80000000]]
For each chunk (we chose 50), run the distance function in parallel using the IPython load-balanced view in asynchronous mode. The outer loop of 50 iterations could itself be parallelized in a similar way on AWS
clouds = {}
bview = c.load_balanced_view()
print('Start: {}'.format(datetime.datetime.now().time()))
for s in range(50):
partitions = define_partitions(len(pi_string)/50, shift=s)
res = bview.map(compute_clouds, partitions)
clouds[s] = res.get_dict()
print('\t Done {}/50: {}'.format(s+1, datetime.datetime.now().time()))
print('End: {}'.format(datetime.datetime.now().time()))
Start: 18:01:33.557691 Done 1/50: 18:02:03.243289 Done 2/50: 18:02:27.280713 Done 3/50: 18:02:50.663592 Done 4/50: 18:03:16.893923 Done 5/50: 18:03:42.233646 Done 6/50: 18:04:08.538198 Done 7/50: 18:04:35.232792 Done 8/50: 18:04:58.775786 Done 9/50: 18:05:23.287744 Done 10/50: 18:05:48.709454 Done 11/50: 18:06:11.356334 Done 12/50: 18:06:38.789648 Done 13/50: 18:07:06.231197 Done 14/50: 18:07:32.110501 Done 15/50: 18:07:57.054639 Done 16/50: 18:08:18.864231 Done 17/50: 18:08:44.986787 Done 18/50: 18:09:08.962241 Done 19/50: 18:09:35.615003 Done 20/50: 18:10:01.280692 Done 21/50: 18:10:28.583998 Done 22/50: 18:10:55.781652 Done 23/50: 18:11:23.248102 Done 24/50: 18:11:49.039647 Done 25/50: 18:12:16.805712 Done 26/50: 18:12:42.278682 Done 27/50: 18:13:09.576268 Done 28/50: 18:13:34.535707 Done 29/50: 18:14:01.232585 Done 30/50: 18:14:25.371402 Done 31/50: 18:14:52.634643 Done 32/50: 18:15:19.112538 Done 33/50: 18:15:46.914407 Done 34/50: 18:16:13.612723 Done 35/50: 18:16:41.510302 Done 36/50: 18:17:08.265121 Done 37/50: 18:17:34.765585 Done 38/50: 18:18:00.173819 Done 39/50: 18:18:25.118257 Done 40/50: 18:18:45.643480 Done 41/50: 18:19:09.960245 Done 42/50: 18:19:36.427499 Done 43/50: 18:20:03.294402 Done 44/50: 18:20:30.645344 Done 45/50: 18:20:56.476389 Done 46/50: 18:21:23.050785 Done 47/50: 18:21:48.503384 Done 48/50: 18:22:14.784374 Done 49/50: 18:22:42.516931 Done 50/50: 18:23:09.422109 End: 18:23:09.422418
If we were running on a machine with more nodes, we could skip the outer loop altogether by running the following commented code
#partitions = define_partitions(len(pi_string))
#%time bview = c.load_balanced_view()
#%time res = bview.map(compute_clouds, partitions)
#%time clouds = res.get_dict()
Gather all results (from all iterations and all parallel engines within each iteration) and sort for the best 10. Each result contains the value of the distance function (smaller is better) and the index at which the candidate string of length 84 starts
results = []
for run in clouds.keys():
for cloud in clouds[run].values():
results.extend(cloud)
sorted(results)[:10]
[(180, 981165566), (187, 972165010), (188, 297640119), (192, 509863836), (193, 197342990), (193, 343577393), (193, 615021438), (193, 702340521), (194, 197342978), (194, 789652974)]
Plot the results using matplotlib and a custom colormap of gray tones
def make_grid(pi_substring):
# Reshape substring to grid dimensions
nrows, ncols = 6, 14
z = np.array([int(c) for c in pi_substring]).reshape((nrows, ncols))
return z
def plot_cloud(grid):
cmap = colors.ListedColormap(['#ffffff', '#f0f0f0', '#ebebeb', '#d0d0d0', '#c1c1c1', '#a8a8a8',
'#878787', '#878787', '#535353', '#333333', '#000000'])
# Plot
ax = plt.axes()
ax.matshow(grid, cmap=cmap)
ax.set_axis_off()
def plot_clouds(results):
cmap = colors.ListedColormap(['#ffffff', '#f0f0f0', '#ebebeb', '#d0d0d0', '#c1c1c1', '#a8a8a8',
'#878787', '#878787', '#535353', '#333333', '#000000'])
fig, (ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9) = plt.subplots(10,1)
fig.set_size_inches(12,24)
fig.tight_layout()
fig.subplots_adjust(hspace=1)
for (ind, ax) in enumerate(fig.axes):
ax.matshow(make_grid(pi_string[results[ind][1]:results[ind][1]+84]), cmap=cmap)
ax.set_axis_off()
ax.set_title('Rank {}, offset {}\n Sequence {}'.format(ind+1, results[ind][1],
pi_string[results[ind][1]:results[ind][1]+84]))
plot_cloud(make_grid(soundcloud_string))
plot_clouds(sorted(results)[:10])
Comparison of the target string with the best match
print(soundcloud_string)
print(pi_string[981165566:981165566+84])
000003399960000003495999940001275959999710395959599999936959595999999618595959999981 000053743536020032496985553181128909983810344939134894807349584729687746183109884672