%env JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64
%matplotlib inline
from __future__ import print_function, division
from pyspark import SparkConf
from pyspark import SparkContext
from StringIO import StringIO
import matplotlib.pyplot as plt
import numpy as np
from pprint import pprint
# Build the Spark configuration and hand it to the context explicitly.
# A SparkContext created without `conf=` ignores settings made on a separate
# SparkConf object, so the executor-instances request would have no effect.
conf = SparkConf()
conf.set('spark.executor.instances', 10)
sc = SparkContext(conf=conf)
env: JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64
# Take one record from the per-image measures and print the fields stored in
# the mapping held at index 1 of that record.
example = sc.pickleFile('hdfs:///t1/map_each_image/measures').take(1)[0]
measures = example[1]
print("Keys:", measures.keys())
print("Centroids in 1 image flattened:", measures['cen'])
print("Histogram flattened:", measures['histo'])
# The two hash sequences are long, so only their first five entries are shown.
print("Perceptive hash (abbrev.):", measures['phash'][:5])
print("Ward cluster hash (abbrev.):", measures['ward'][:5])
print('PCA factors and variance', measures['pca_fac'], measures['pca_var'])
Keys: ['cen', 'pca_var', 'meta', 'phash', 'histo', 'ward', 'pca_fac', 'id'] Centroids in 1 image flattened: [ 0.08129681 0.07577123 0.04393618 0.52625495 0.42874384 0.39424273 0.98807454 0.85518652 0.70573205 0.38849056 0.35746324 0.16466478 0.42599285 0.61729616 0.15119481 0.69050598 0.57329297 0.58296841 0.30381802 0.25944528 0.25707352 0.15910684 0.13810225 0.08577942 0.5084554 0.45283014 0.54034466 0.24065548 0.21027599 0.12343728 0.38219753 0.33428988 0.36679208 0.87849122 0.69335157 0.61909974] Histogram flattened: [ 0.06662314 0.11274848 0.16139705 0.30679092 0.44019818 0.97807258 0.06343307 0.10054344 0.13846138 0.27710024 0.51907748 0.79767239 0.03020411 0.05431626 0.07841869 0.14961122 0.3458291 0.67084092] Perceptive hash (abbrev.): [-4759284544195859710, -7410311068947615587, -2379457584108749464, 9090031768008511141, 6956508733075400970] Ward cluster hash (abbrev.): (-5013803343762532022, -5013803343762532022, 3031575136247091488, 7497007922500697532, -2699164510210933549) PCA factors and variance [-0.64461267 -0.5783459 -0.49999053 0.01158595 -0.66131691 0.75001714 -0.76442153 0.47767769 0.43299395] [ 0.1428409 0.01249751 0.00195216]
# Show two sample records from each cluster-keyed lookup table.
print('Kmeans cluster to perceptive hash')
pprint(sc.pickleFile("hdfs:///t1/km/cluster_to_phash").take(2))
print('Kmeans cluster to Ward cluster hash')
# This section must read the Ward table, not the phash table again, so that
# the data matches its heading.
# NOTE(review): path assumed to follow the sibling naming convention
# (hdfs:///t1/km/<table>) with the 'cluster_to_ward' output enabled in the
# config -- confirm it exists on the cluster.
pprint(sc.pickleFile("hdfs:///t1/km/cluster_to_ward").take(2))
Kmeans cluster to perceptive hash [(0, (-4759284544195859710, -7410311068947615587)), (0, (-7410311068947615587, -2379457584108749464))] Kmeans cluster to Ward cluster hash [(0, (-4759284544195859710, -7410311068947615587)), (0, (-7410311068947615587, -2379457584108749464))]
# Show two sample records from each hash-keyed lookup table, in order.
for title, path in (
        ('Perceptive hash to kmeans cluster', "hdfs:///t1/km/phash_to_cluster"),
        ("Ward cluster hash to kmeans cluster", "hdfs:///t1/km/ward_to_cluster"),
):
    print(title)
    pprint(sc.pickleFile(path).take(2))
Perceptive hash to kmeans cluster [((-4759284544195859710, -7410311068947615587), 0), ((-7410311068947615587, -2379457584108749464), 0)] Ward cluster hash to kmeans cluster [(-5013803343762532022, 0), (-5013803343762532022, 0)]
# Fetch the first stored ward-union record and pretty-print it as a plain dict.
ward_unions = sc.pickleFile('hdfs:////t1/km/ward_unions')
clust0_ward = ward_unions.take(1)[0]
pprint(dict(clust0_ward.items()))
{-9039818526033544930: 1, -8968478298841560134: 1, -8784979837131872929: 2, -8238206338916862869: 1, -8097910731439140361: 1, -7977916611896812490: 1, -7651551579179140278: 1, -7547510319790098486: 1, -7146024615841089891: 1, -6951143412860221335: 1, -6696910335962296665: 3, -6683838510609320807: 1, -6439366262116039212: 2, -6380794332220625616: 1, -6371097860695599930: 1, -6262689646983250285: 1, -6207253714907588360: 1, -5930730968449722594: 1, -5892100271649265087: 1, -5479837740446289100: 1, -5460220511711024102: 1, -5417726837777902769: 1, -5130029528390688653: 2, -5013803343762532022: 3, -4669024366043471636: 1, -4148630240740491025: 1, -4035475343318357777: 1, -3831614152369362579: 2, -3537998416777903725: 1, -2885615042689910024: 1, -2699164510210933549: 2, -2641242543027661281: 1, -2529486012270897414: 1, -2115443718477647539: 1, -2081587739269104843: 1, -2047413555851697388: 1, -1882888838447640072: 1, -993185207907190850: 2, -954979497355127575: 1, -913044934656171724: 1, -822478295673647710: 1, -503530645212927693: 1, 196458662987777680: 1, 986794119644918572: 1, 1144850855779553720: 1, 1472601450558387876: 1, 1605200551941798738: 1, 1812673853944481358: 1, 1838417486818896428: 1, 2523860936357694907: 1, 2695353490322839954: 1, 2810012508846950117: 3, 3031575136247091488: 1, 3203440807360164552: 1, 3457338316476294871: 1, 4302660866798555855: 1, 4797748490654898946: 1, 5235023410939091882: 1, 5692573854726051135: 1, 5942326543197887389: 1, 6180127811751478618: 1, 6655231192280505930: 1, 6773348016712109053: 1, 7107395565539596032: 1, 7497007922500697532: 2, 7497017561653154532: 1, 7544431165752988482: 3, 8118683264794037994: 1, 8195333527713462377: 1, 8526517616553538725: 1, 8565268080245686265: 1, 8788672895261669645: 1, 8871311509876279992: 1}
# Show two sample records from each hash -> image key table.
ward_key_sample = sc.pickleFile('hdfs:///t1/km/ward_to_key').take(2)
print('ward_to_key\n')
pprint(ward_key_sample)
phash_key_sample = sc.pickleFile('hdfs:///t1/km/phash_to_key').take(2)
print('\n\nphash_to_key\n')
pprint(phash_key_sample)
ward_to_key [(-5013803343762532022, u'hdfs://ip-10-123-209-104:9000/imgs/femalegotoneelduns.jpg'), (-5013803343762532022, u'hdfs://ip-10-123-209-104:9000/imgs/femalegotoneelduns.jpg')] phash_to_key [((-4759284544195859710, -7410311068947615587), u'hdfs://ip-10-123-209-104:9000/imgs/femalegotoneelduns.jpg'), ((-7410311068947615587, -2379457584108749464), u'hdfs://ip-10-123-209-104:9000/imgs/femalegotoneelduns.jpg')]
# Load the per-candidate match counts produced from the Ward hashes and from
# the perceptive hashes.
ward_matches = sc.pickleFile('hdfs:///t1/candidates/c1/ward_to_key_counts')
phash_matches = sc.pickleFile('hdfs:///t1/candidates/c1/phash_to_key_counts')
# Full outer join keeps candidates that appear in only one of the two sources;
# the missing side is None in the joined record.
joined = ward_matches.fullOuterJoin(phash_matches)
# Local copies of both match sets.
wm, pm = ward_matches.collect(), phash_matches.collect()
def best_votes(x):
    """Merge the two vote dictionaries of a joined record and pick the winner.

    Parameters
    ----------
    x : tuple
        A record of the form ``(key, (left, right))`` as produced by the full
        outer join of the ward and phash match sets. ``left`` and ``right``
        are each either ``None`` (that side had no entry for ``key``) or a
        sequence whose element at index 1 is a dict mapping a name to a
        numeric vote count.

    Returns
    -------
    tuple
        ``(key, (best, votes))`` where ``votes`` is a new dict holding the
        per-name sum of both sides and ``best`` is the ``(name, count)`` pair
        with the highest count.

    Raises
    ------
    IndexError
        If neither side contributes any votes (there is no best entry).
    """
    left, right = x[1][0], x[1][1]
    left_votes = {} if left is None else left[1]
    right_votes = {} if right is None else right[1]

    # Merge into a copy so the dictionaries inside the input record are left
    # untouched; the record may be reused by the caller.
    votes = dict(right_votes)
    for name in left_votes:
        votes[name] = votes.get(name, 0) + left_votes[name]

    # Stable ascending sort, then take the last element: on a tie the entry
    # that appears latest in `votes` wins.
    best = sorted(votes.items(), key=lambda item: item[1])[-1]
    return x[0], (best, votes)
# Score every joined candidate with best_votes, bring the results to the
# driver, and show the first one as an example.
voted = joined.map(best_votes)
phash_and_ward = voted.collect()
print("Example votes dictionary:")
pprint(phash_and_ward[0])
Example votes dictionary: (u'hdfs://ip-10-123-209-104:9000/fuzzy/male93384629338462.10.jpg', ((u'hdfs://ip-10-123-209-104:9000/imgs/male93384629338462.10.jpg', 40670), {u'hdfs://ip-10-123-209-104:9000/imgs/femalegotoneelduns.jpg': 19040, u'hdfs://ip-10-123-209-104:9000/imgs/femalevstrosvstros.17.jpg': 7840, u'hdfs://ip-10-123-209-104:9000/imgs/male93384629338462.10.jpg': 40670, u'hdfs://ip-10-123-209-104:9000/imgs/male93384899338489.19.jpg': 25760, u'hdfs://ip-10-123-209-104:9000/imgs/maleajflemajflem.5.jpg': 33600, u'hdfs://ip-10-123-209-104:9000/imgs/maledcbowedcbowe.20.jpg': 8400, u'hdfs://ip-10-123-209-104:9000/imgs/malegdhatcgdhatc.20.jpg': 7840, u'hdfs://ip-10-123-209-104:9000/imgs/maleggregoggrego.1.jpg': 3360, u'hdfs://ip-10-123-209-104:9000/imgs/malejabinsjabins.5.jpg': 11200, u'hdfs://ip-10-123-209-104:9000/imgs/malesaduahsaduah.11.jpg': 9520, u'hdfs://ip-10-123-209-104:9000/imgs/malestaffmoorsmoors.13.jpg': 9520, u'hdfs://ip-10-123-209-104:9000/imgs/malestaffmoorsmoors.14.jpg': 17920, u'hdfs://ip-10-123-209-104:9000/imgs/malestaffobeidnobeidn.14.jpg': 33040, u'hdfs://ip-10-123-209-104:9000/imgs/malesvkrizsvkriz.17.jpg': 10640}))
def load_image(image):
    """Load one image, where image = (key, blob).

    Parameters
    ----------
    image : tuple
        ``(key, blob)`` pair; ``blob`` is the raw encoded image content as a
        byte string.

    Returns
    -------
    tuple
        ``(key, pixels)`` where ``pixels`` is the decoded image as a
        ``numpy.uint8`` array.
    """
    # Imported locally so the function carries its own dependencies.
    # BytesIO is the correct in-memory wrapper for binary data and works on
    # both Python 2 and Python 3.
    from io import BytesIO
    from PIL import Image
    img = Image.open(BytesIO(image[1]))
    return image[0], np.asarray(img, dtype=np.uint8)
# For each candidate, draw it next to the original image that got the most votes.
for p in phash_and_ward:
    fuzzy_path = p[0]
    best_path = p[1][0][0]  # name part of the winning (name, count) pair
    # Both images are fetched before anything is printed or drawn.
    cname, candidate = load_image(sc.binaryFiles(fuzzy_path).collect()[0])
    mname, matched = load_image(sc.binaryFiles(best_path).collect()[0])
    panels = (
        ("Candidate (fuzzy) ", cname, candidate),
        ("Matched (original) ", mname, matched),
    )
    # Left panel is the candidate, right panel is the match.
    for position, (label, name, pixels) in enumerate(panels, 1):
        print(label, name)
        plt.subplot(1, 2, position)
        plt.imshow(pixels)
    plt.show()
Candidate (fuzzy) hdfs://ip-10-123-209-104:9000/fuzzy/male93384629338462.10.jpg Matched (original) hdfs://ip-10-123-209-104:9000/imgs/male93384629338462.10.jpg
Candidate (fuzzy) hdfs://ip-10-123-209-104:9000/fuzzy/malesaduahsaduah.11.jpg Matched (original) hdfs://ip-10-123-209-104:9000/imgs/malesaduahsaduah.11.jpg
Candidate (fuzzy) hdfs://ip-10-123-209-104:9000/fuzzy/malejabinsjabins.5.jpg Matched (original) hdfs://ip-10-123-209-104:9000/imgs/malejabinsjabins.5.jpg
Candidate (fuzzy) hdfs://ip-10-123-209-104:9000/fuzzy/maleajflemajflem.5.jpg Matched (original) hdfs://ip-10-123-209-104:9000/imgs/maleajflemajflem.5.jpg
Candidate (fuzzy) hdfs://ip-10-123-209-104:9000/fuzzy/malestaffmoorsmoors.13.jpg Matched (original) hdfs://ip-10-123-209-104:9000/imgs/malestaffmoorsmoors.13.jpg
Candidate (fuzzy) hdfs://ip-10-123-209-104:9000/fuzzy/male93384899338489.19.jpg Matched (original) hdfs://ip-10-123-209-104:9000/imgs/male93384899338489.19.jpg
Candidate (fuzzy) hdfs://ip-10-123-209-104:9000/fuzzy/femalevstrosvstros.17.jpg Matched (original) hdfs://ip-10-123-209-104:9000/imgs/femalevstrosvstros.17.jpg
Candidate (fuzzy) hdfs://ip-10-123-209-104:9000/fuzzy/malegdhatcgdhatc.20.jpg Matched (original) hdfs://ip-10-123-209-104:9000/imgs/malegdhatcgdhatc.20.jpg
Candidate (fuzzy) hdfs://ip-10-123-209-104:9000/fuzzy/femalegotoneelduns.jpg Matched (original) hdfs://ip-10-123-209-104:9000/imgs/femalegotoneelduns.jpg
Candidate (fuzzy) hdfs://ip-10-123-209-104:9000/fuzzy/maledcbowedcbowe.20.jpg Matched (original) hdfs://ip-10-123-209-104:9000/imgs/malestaffobeidnobeidn.14.jpg
Candidate (fuzzy) hdfs://ip-10-123-209-104:9000/fuzzy/maleggregoggrego.1.jpg Matched (original) hdfs://ip-10-123-209-104:9000/imgs/maleggregoggrego.1.jpg
Candidate (fuzzy) hdfs://ip-10-123-209-104:9000/fuzzy/malesvkrizsvkriz.17.jpg Matched (original) hdfs://ip-10-123-209-104:9000/imgs/malesvkrizsvkriz.17.jpg
Candidate (fuzzy) hdfs://ip-10-123-209-104:9000/fuzzy/malestaffmoorsmoors.14.jpg Matched (original) hdfs://ip-10-123-209-104:9000/imgs/malestaffmoorsmoors.14.jpg
Candidate (fuzzy) hdfs://ip-10-123-209-104:9000/fuzzy/malestaffobeidnobeidn.14.jpg Matched (original) hdfs://ip-10-123-209-104:9000/imgs/malestaffobeidnobeidn.14.jpg
config = {'actions': ['map_each_image', 'kmeans', 'find_similar'],
'candidate_batch': 'c1',
'candidate_has_mapped': False,
'candidate_measures_spec': '/t1/candidates/c1/measures',
'candidate_spec': '/fuzzy/*',
'example_data': '/imgs/',
'fuzzy_example_data': '/fuzzy/',
'in_memory_set_len': 8000000,
'input_spec': '/imgs/*',
'kmeans_group_converge': 10000,
'kmeans_output': {'cluster_to_flattened': True,
'cluster_to_key': True,
'cluster_to_phash': True,
'cluster_to_ward': True,
'flattened_to_cluster': True,
'flattened_to_key': True,
'flattened_to_phash': True,
'key_to_cluster': True,
'key_to_phash': True,
'phash_to_cluster': True,
'phash_to_flattened': True,
'phash_to_key': True,
'ward_to_cluster': True,
'ward_to_key': True},
'kmeans_sample': 2000,
'maxIterations': 15,
'max_iter_group': 10,
'n_clusters': 12,
'n_clusters_group': 8,
'patch': {'max_patches': 4,
'random_state': 0,
'window_as_fraction': [0.5, 0.5]},
'phash_bits': 128,
'phash_chunk_len': 2,
'quantiles': [5, 15, 25, 50, 75, 95],
'search_rounds': 1,
'search_sample_step': 100,
'test_name': 't1',
'ward_clusters': 8,
'ward_x_down': 16,
'x_down': 32}