# String-clustering experiments: group near-duplicate "style" names with a
# single-pass, centroid-based clustering on jellyfish string scores, then
# check the clusters against the 'smv' column of the data.
#
# NOTE(review): this looks like an IPython notebook export. Bare expressions
# such as ``Styles`` or ``len(MatchedDict)`` only display a value when run
# interactively; they are kept so the notebook reads the same.
# The code is written to run unchanged on Python 2.7 and Python 3.

import collections
import datetime
import itertools
import os
import random
import re

import jellyfish as jf
import numpy as np
import pandas as pd
from IPython.core.display import HTML
from IPython.core.display import Image
from IPython.display import Latex
from IPython.display import Math
from numpy import nan as NA
from pandas import DataFrame

# %qtconsole
# (IPython magic from the original notebook; it is not valid Python in a
# plain source file, so run it by hand in an IPython session if wanted.)

# Figure from the accompanying notes. Called after the imports so that
# ``Image`` is defined when the file is executed top to bottom.
Image(r'C:\Users\rcreedon\Dropbox\Rory Notes\Notes\JellyFish\StringStrategy\clusters.png')

# Every run of characters that is neither alphanumeric nor a space.
_NON_ALNUM = re.compile('[^A-Za-z0-9 ]+')

# Location of the first example data set.
STYLES_CSV = r'C:\Users\rcreedon\Dropbox\GIZSupervisor\DATA\Production_Data\STP_Data\Data_Sets\Wave1\1005\1005_all_merged.csv'


def _nearest_centroid(text, centres):
    """Return ``(index, score)`` of the centroid string scoring highest against ``text``.

    ``centres`` is a list of strings. Returns ``(None, None)`` when it is
    empty. Ties are resolved in favour of the earliest centroid. Scores come
    from ``jf.jaro_distance``; callers treat larger scores as closer matches.
    """
    if not centres:
        return None, None
    scores = [jf.jaro_distance(text, centre) for centre in centres]
    best = max(scores)
    return scores.index(best), best


def _cluster_values(df, col1, col2, members):
    """Return the unique ``col2`` values of each member, concatenated in member order.

    For every string in ``members`` the rows where ``df[col1]`` equals that
    string are selected and their unique ``col2`` values appended.
    """
    return list(itertools.chain.from_iterable(
        df.loc[df[col1] == member, col2].unique() for member in members))


def _multi_member_clusters(ClassList, Threshold):
    """Run ``SlinkSC`` and return ``(members, centroid)`` pairs for clusters with more than one member.

    ``centroid`` is the ``[stripped, original]`` pair produced by ``SlinkSC``.
    """
    Originals, _, Centroids = SlinkSC(ClassList, Threshold)
    return [(members, centre)
            for members, centre in zip(Originals, Centroids)
            if len(members) > 1]


def _load_clean_styles(CsvPath):
    """Read the unique, non-missing ``style`` values of a CSV file and normalise them.

    Normalisation removes every non-alphanumeric, non-space character,
    lower-cases the text and collapses runs of whitespace to single spaces.
    Missing values are dropped up front, so a column without any NaN no longer
    raises (the original ``del Styles[Styles.index(np.nan)]`` did).
    """
    Raw = pd.read_csv(CsvPath)['style'].dropna().unique()
    Cleaned = [_NON_ALNUM.sub('', style) for style in Raw]
    return [" ".join(style.lower().split()) for style in Cleaned]


def SLINK(SList, Threshold):
    """Cluster the strings of ``SList`` in a single pass and return the clusters.

    Parameters:
        SList: list of strings. It is shuffled IN PLACE, so the caller's list
            order changes and results vary from run to run.
        Threshold: minimum score against a cluster centroid for a string to
            join that cluster.

    Returns:
        A list of clusters, each a list of the member strings.
    """
    #1 Process the strings in random order.
    random.shuffle(SList)
    Clusters = []   # Clusters[i]: member strings of cluster i
    Centroid = []   # Centroid[i]: string that candidates are compared against
    Scores = []     # Scores[i]: best joining score so far, None until a first join
    for string in SList:
        #2-#4 Score the string against every centroid and keep the best.
        MaxInd, MaxVal = _nearest_centroid(string, Centroid)
        #5 Close enough: join the best-scoring cluster.
        if MaxInd is not None and MaxVal >= Threshold:
            Clusters[MaxInd].append(string)
            #6 The first joiner only records its score.
            if Scores[MaxInd] is None:
                Scores[MaxInd] = MaxVal
            #7 A later joiner that beats the recorded score becomes the centroid.
            elif MaxVal > Scores[MaxInd]:
                Scores[MaxInd] = MaxVal
                Centroid[MaxInd] = string
            continue
        #8 No cluster is close enough (or none exists yet): start a new one.
        Clusters.append([string])
        Centroid.append(string)
        Scores.append(None)
    return Clusters


Styles = _load_clean_styles(STYLES_CSV)
Styles

Clusters = SLINK(Styles, 0.8)
# Largest clusters first; the sort is stable so equal sizes keep their order.
Clusters.sort(key=len, reverse=True)
Clusters


class Stripped(object):
    """A string together with a stripped-down version used for matching.

    Attributes:
        original: the string exactly as supplied.
        stripped: the lower-cased string with special characters replaced by
            spaces and the generic words removed.
    """

    def __init__(self, original, GenericAll, GenericWhite = None, DelWhite = False):
        """Build the stripped form of ``original``.

        Parameters:
            original: the string to process.
            GenericAll: words removed wherever they occur, even inside a
                longer word. Each entry is used as a regular-expression
                pattern, exactly as given.
            GenericWhite: optional words removed only when they stand alone,
                i.e. are delimited by word boundaries. Also used as patterns.
            DelWhite: when true, all spaces are removed at the end.
        """
        # The string in its original format.
        self.original = original
        StrVal = original.lower()
        # Replace special characters with a space.
        StrVal = _NON_ALNUM.sub(' ', StrVal)
        # Strip out all occurrences of the GenericAll sub-strings that appear
        # anywhere in the string.
        for word in GenericAll:
            StrVal = re.sub(word, '', StrVal)
        # If provided, strip out the GenericWhite sub-strings only where they
        # are not part of a longer sequence of word characters.
        if GenericWhite is not None:
            for word in GenericWhite:
                StrVal = re.sub(r'\b' + word + r'\b', '', StrVal)
        # Optionally remove all spaces.
        if DelWhite:
            StrVal = StrVal.replace(' ', '')
        # The stripped string.
        self.stripped = StrVal

    def __repr__(self):
        return 'Stripped(original=%r, stripped=%r)' % (self.original, self.stripped)


def SlinkSC(ClassList, Threshold):
    """Cluster ``Stripped`` objects on their stripped text in a single pass.

    Same procedure as ``SLINK``, but matching uses ``.stripped`` while the
    original strings are tracked alongside.

    Parameters:
        ClassList: list of objects with ``original`` and ``stripped``
            attributes. It is shuffled IN PLACE.
        Threshold: minimum score against a cluster centroid for an item to
            join that cluster.

    Returns:
        ``[Clusters, ClustersStripped, Centroid]``, three lists indexed by
        cluster: the members' original strings, the members' stripped
        strings, and the ``[stripped, original]`` pair of the centroid.
    """
    #1 Process the items in random order.
    random.shuffle(ClassList)
    Clusters = []
    ClustersStripped = []
    Centroid = []   # Centroid[i] == [stripped, original] of cluster i's centroid
    Scores = []     # best joining score so far, None until a first join
    for StrippedClass in ClassList:
        MaxInd, MaxVal = _nearest_centroid(
            StrippedClass.stripped, [centre[0] for centre in Centroid])
        if MaxInd is not None and MaxVal >= Threshold:
            Clusters[MaxInd].append(StrippedClass.original)
            ClustersStripped[MaxInd].append(StrippedClass.stripped)
            if Scores[MaxInd] is None:
                # The first joiner only records its score.
                Scores[MaxInd] = MaxVal
            elif MaxVal > Scores[MaxInd]:
                # A better-scoring joiner becomes the new centroid.
                Scores[MaxInd] = MaxVal
                Centroid[MaxInd] = [StrippedClass.stripped, StrippedClass.original]
            continue
        # Nothing close enough (or no cluster yet): start a new cluster.
        Clusters.append([StrippedClass.original])
        ClustersStripped.append([StrippedClass.stripped])
        Centroid.append([StrippedClass.stripped, StrippedClass.original])
        Scores.append(None)
    return [Clusters, ClustersStripped, Centroid]


Styles = _load_clean_styles(STYLES_CSV)

# Count how often each word occurs across the style names and list the
# repeated ones; these are candidates for the generic word lists.
WordDict = collections.Counter(
    word for style in Styles for word in style.split(' '))
for word, value in WordDict.items():
    if value > 1:
        print('%s %s' % (word, value))

GenericAll = ['denim', 'jkt', 'chino', 'short', 'jacket']
ClassList = [Stripped(elem, GenericAll, DelWhite = True) for elem in Styles]
Clustered = SlinkSC(ClassList, 0.8)

# Both lists have the same cluster sizes in the same order, so two stable
# sorts by size keep them aligned with each other.
ClustersOriginal = Clustered[0]
ClustersOriginal.sort(key=len, reverse=True)
ClustersStripped = Clustered[1]
ClustersStripped.sort(key=len, reverse=True)

# Original clusters
ClustersOriginal[:8]

# Stripped clusters
ClustersStripped[:8]

# Second example data set. The CSV is read once and reused for the SMV checks.
DF = pd.read_csv('ExampleData.csv')
# Bracket access: on current pandas ``DF.style`` is the Styler accessor, not
# the 'style' column.
Bstyles = list(DF['style'].unique())

GenericRemoveAll = ['bottom', 'boys', 'long', 'topb', 'tank', 'basic', 'polo', 'shorts', 'ssivts',
                    'top', 'sslvts', 'mens', 'nightware', 'lslvts', 'nightwear', 'msp', 'lsivts',
                    'tee', 'large', 'slv']
GenericWhiteSpace = ['l', 'pj', 's']

BClassList = [Stripped(elem, GenericRemoveAll, GenericWhiteSpace, True) for elem in Bstyles]
BClustersSC = SlinkSC(BClassList, 0.85)

# Give separate names to each list in the SlinkSC result.
BClustersOriginal = BClustersSC[0]
BClustersOriginal.sort(key=len, reverse=True)
BClustersStripped = BClustersSC[1]
BClustersStripped.sort(key=len, reverse=True)

BClustersOriginal[:5]

BClustersStripped[:5]

BLClustersSC = SlinkSC(BClassList, 0.95)

# Give separate names to each list in the SlinkSC result.
# Fixed: these were previously taken from BClustersSC (the 0.85 run), so the
# "0.95" figures below silently repeated the 0.85 results.
BLClustersOriginal = BLClustersSC[0]
BLClustersOriginal.sort(key=len, reverse=True)
BLClustersStripped = BLClustersSC[1]
BLClustersStripped.sort(key=len, reverse=True)

BLClustersOriginal[:5]

# Number of multi-member clusters with the threshold at 0.85
print(len([elem for elem in BClustersOriginal if len(elem) > 1]))
# Number of multi-member clusters with the threshold at 0.95
print(len([elem for elem in BLClustersOriginal if len(elem) > 1]))

# Example 1
for style in BClustersOriginal[1]:
    print(DF.loc[DF['style'] == style, 'smv'].unique())

print(BClustersOriginal[1])
print(BClustersStripped[1])

# Example 2
for style in BLClustersOriginal[0]:
    print(DF.loc[DF['style'] == style, 'smv'].unique())


def maxClusters(ClassList, startThreshold, stopThreshold, step, df=None, col1='style', col2='smv'):
    """Count the consistent clusters obtained at each threshold of a sweep.

    A cluster is counted when it has more than one member and all of its
    members share a single ``col2`` value in ``df``.

    Parameters:
        ClassList: list of ``Stripped`` objects (shuffled in place by SlinkSC).
        startThreshold, stopThreshold, step: the thresholds tried are
            ``startThreshold + i * step`` for i = 0, 1, ... up to and
            including ``stopThreshold``.
        df: data frame to check against; defaults to the module-level ``DF``.
        col1: column holding the strings (default 'style').
        col2: column holding the value that must agree (default 'smv').

    Returns:
        A list with one count per threshold, in sweep order.

    Raises:
        ValueError: if ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError('step must be positive')
    if df is None:
        df = DF
    ConsistentClusters = []
    if stopThreshold < startThreshold:
        return ConsistentClusters
    # Derive the number of steps once instead of repeatedly adding ``step``:
    # accumulated float error could otherwise push the final value just past
    # ``stopThreshold`` and drop the last threshold from the sweep.
    Steps = int((stopThreshold - startThreshold) / float(step) + 1e-9) + 1
    for StepNum in range(Steps):
        Threshold = startThreshold + StepNum * step
        ConsistentCount = 0
        for members, _ in _multi_member_clusters(ClassList, Threshold):
            if len(set(_cluster_values(df, col1, col2, members))) == 1:
                ConsistentCount += 1
        ConsistentClusters.append(ConsistentCount)
    return ConsistentClusters


# The clustering is randomised, so repeated sweeps give different counts.
print(maxClusters(BClassList, 0.7, 0.95, 0.025))
print(maxClusters(BClassList, 0.7, 0.95, 0.025))
print(maxClusters(BClassList, 0.7, 0.95, 0.025))

# Threshold giving the most consistent clusters, over ten coarse sweeps.
Optimum = []
for x in range(10):
    L = maxClusters(BClassList, 0.7, 0.95, 0.025)
    Ind = L.index(max(L))
    Val = 0.7 + (Ind*0.025)
    Optimum.append(Val)

Optimum

# The same over a finer sweep between 0.82 and 0.88.
Optimum2 = []
for x in range(10):
    L = maxClusters(BClassList, 0.82, 0.88, 0.01)
    Ind = L.index(max(L))
    Val = 0.82 + (Ind*0.01)
    Optimum2.append(Val)

print(Optimum2)
print(np.mean(Optimum2))


def IdentifyClusters(ClassList, Threshold, df, col1, col2):
    """Return the multi-member clusters whose members all share one ``col2`` value.

    Parameters:
        ClassList: list of ``Stripped`` objects (shuffled in place by SlinkSC).
        Threshold: clustering threshold passed to ``SlinkSC``.
        df: data frame holding the strings and the values to compare.
        col1: column of ``df`` holding the original strings.
        col2: column of ``df`` whose values must agree within a cluster.

    Returns:
        A dict mapping the original string of each accepted cluster's
        centroid to the list of that cluster's original strings.
    """
    ClustersDict = {}
    for members, centre in _multi_member_clusters(ClassList, Threshold):
        if len(set(_cluster_values(df, col1, col2, members))) == 1:
            ClustersDict[centre[1]] = members
    return ClustersDict


MatchedDict = IdentifyClusters(BClassList, 0.85, DF, 'style', 'smv')
len(MatchedDict)

# Total number of strings covered by the matched clusters.
Count = sum(len(value) for value in MatchedDict.values())
Count

for Key in list(MatchedDict)[:10]:
    print('The cluster name is ' + Key)
    print(MatchedDict[Key])
    print('')

ForFailed = SlinkSC(BClassList, 0.85)
# Fixed: the filter was ``len(elem) > 0``, which is always true; only
# multi-member clusters are clustering candidates.
Potential = [elem for elem in ForFailed[0] if len(elem) > 1]

# Print a selection of the failed clusters (members disagree on SMV).
for elem in Potential[:15]:
    SMVs = _cluster_values(DF, 'style', 'smv', elem)
    if len(set(SMVs)) != 1:
        print(elem)
        print(SMVs)
        print('')


def ClustersRemainders(ClassList, Threshold, df, col1, col2):
    """Like ``IdentifyClusters``, but also salvage sub-groups of inconsistent clusters.

    Parameters:
        ClassList: list of ``Stripped`` objects (shuffled in place by SlinkSC).
        Threshold: clustering threshold passed to ``SlinkSC``.
        df: data frame holding the strings and the values to compare.
        col1: column of ``df`` holding the original strings.
        col2: column of ``df`` whose values must agree within a cluster.

    Returns:
        A dict of accepted clusters. A fully consistent cluster is keyed by
        the original string of its centroid. A salvaged sub-group is keyed by
        its shortest member and its list is ordered by increasing length.
    """
    #1-#2 Cluster, keeping only clusters with more than one member.
    ClustersDict = {}
    #3 Accept clusters whose members all share one col2 value.
    for members, centre in _multi_member_clusters(ClassList, Threshold):
        MultiSMV = _cluster_values(df, col1, col2, members)
        if len(set(MultiSMV)) == 1:
            ClustersDict[centre[1]] = members
        #4 Otherwise, provided every member has exactly one col2 value (so
        #  MultiSMV lines up with members), keep each group of two or more
        #  members that share a value.
        elif len(MultiSMV) == len(members):
            for smv in set(MultiSMV):
                StrList = [member for member, value in zip(members, MultiSMV)
                           if value == smv]
                if len(StrList) >= 2:
                    StrList.sort(key=len)
                    ClustersDict[StrList[0]] = StrList
    return ClustersDict


MatchedDict2 = ClustersRemainders(BClassList, 0.85, DF, 'style', 'smv')
len(MatchedDict2)

Count = sum(len(value) for value in MatchedDict2.values())
Count

# Map every clustered string to the name chosen for its cluster.
StringMap = {}
for key, value in MatchedDict2.items():
    for elem in value:
        StringMap[elem] = key

len(DF['style'].unique())
# Apply the first-pass mapping: strings that belong to a matched cluster take
# the cluster's name, everything else keeps its own name. One vectorised map
# replaces the row-by-row chained assignment ``DF['style_new'][row] = ...``,
# which pandas does not guarantee will write through to the frame.
# Bracket access is used because ``DF.style`` is the Styler accessor on
# current pandas, not the 'style' column.
DF['style_new'] = DF['style'].map(lambda name: StringMap.get(name, name))

len(DF['style_new'].unique())

# Second pass: cluster the already-merged names again.
SecondPass = list(DF['style_new'].unique())
SecondClassList = [Stripped(elem, GenericRemoveAll, GenericWhiteSpace, DelWhite = True) for elem in SecondPass]
SecondClusters = SlinkSC(SecondClassList, 0.85)

# Give separate names to each list in the SlinkSC result.
# Fixed: these were previously read from BClustersSC (the first-pass run)
# instead of the second-pass result computed just above.
SecondClusOrig = SecondClusters[0]
SecondClusOrig.sort(key=len, reverse=True)
SecondClusStrip = SecondClusters[1]
SecondClusStrip.sort(key=len, reverse=True)

SecondMatchedDict = ClustersRemainders(SecondClassList, 0.85, DF, 'style', 'smv')
len(SecondMatchedDict)

# Total number of strings covered by the second-pass clusters.
Count = sum(len(value) for value in SecondMatchedDict.values())
Count

for key, value in SecondMatchedDict.items():
    print('%s %s' % (key, value))
    print('')


def ClustersRecursive(Threshold, df, col1, col2, GenericAll, GenericWhite = None, DelWhite = False):
    """Repeatedly cluster ``df[col1]`` and merge accepted clusters until nothing changes.

    Each pass clusters the current unique values of ``df[col1]`` with
    ``SlinkSC``. It accepts clusters whose members all share one ``col2``
    value, and same-value sub-groups of clusters in which every member has
    exactly one ``col2`` value. Every accepted member is then renamed in
    ``df[col1]`` to its cluster's name. ``df`` is modified in place.

    The passes run in a loop rather than by self-recursion, so the number of
    passes is not limited by the interpreter's recursion depth.

    Parameters:
        Threshold: clustering threshold passed to ``SlinkSC``.
        df: data frame to update.
        col1: column of ``df`` holding the strings; it is overwritten.
        col2: column of ``df`` whose values must agree within a cluster.
        GenericAll, GenericWhite, DelWhite: forwarded to ``Stripped`` on
            every pass.

    Returns:
        None. Prints 'Finished1' when a pass produces no multi-member cluster.
    """
    while True:
        ClassList = [Stripped(style, GenericAll, GenericWhite, DelWhite)
                     for style in df[col1].unique()]
        Originals, _, Centroids = SlinkSC(ClassList, Threshold)
        # Keep only clusters with more than one member, with their centroid.
        MultiClusters = [(members, centre)
                         for members, centre in zip(Originals, Centroids)
                         if len(members) > 1]
        if not MultiClusters:
            print('Finished1')
            return

        ClustersDict = {}
        for members, centre in MultiClusters:
            # Unique col2 values of each member, concatenated in member order.
            MultiSMV = list(itertools.chain.from_iterable(
                df.loc[df[col1] == member, col2].unique() for member in members))
            if len(set(MultiSMV)) == 1:
                # Fully consistent cluster, named after its centroid's original string.
                ClustersDict[centre[1]] = members
            elif len(MultiSMV) == len(members):
                # One value per member, so MultiSMV lines up with members:
                # keep each group of two or more members sharing a value,
                # named after its shortest member.
                for smv in set(MultiSMV):
                    StrList = [member for member, value in zip(members, MultiSMV)
                               if value == smv]
                    if len(StrList) >= 2:
                        StrList.sort(key=len)
                        ClustersDict[StrList[0]] = StrList

        # Nothing was accepted on this pass, so there is nothing to merge.
        if not ClustersDict:
            return

        Mapping = {}
        for key, value in ClustersDict.items():
            for elem in value:
                Mapping[elem] = key
        # Fixed: the original looped over the global ``DF.index`` rather than
        # ``df`` and wrote through chained indexing; it then recursed with
        # ``GenericWhite = None, DelWhite = False``, silently dropping the
        # caller's settings after the first pass.
        df[col1] = df[col1].map(lambda name: Mapping.get(name, name))


DF['style_new2'] = DF['style']
ClustersRecursive(0.85, DF, 'style_new2', 'smv', GenericRemoveAll, GenericWhiteSpace, True)
len(DF['style_new2'].unique())

# The clustering is randomised: repeat from scratch to see how much the
# final number of distinct names varies between runs.
LenList = []
for x in range(20):
    DF['style_new2'] = DF['style']
    ClustersRecursive(0.85, DF, 'style_new2', 'smv', GenericRemoveAll, GenericWhiteSpace, True)
    LenList.append(len(DF['style_new2'].unique()))

print(LenList)