# Display the cluster illustration. Image is imported right here because this
# statement sits above the file's main import block and would otherwise raise
# a NameError when the file is executed top to bottom.
from IPython.display import Image
Image(r'C:\Users\rcreedon\Dropbox\Rory Notes\Notes\JellyFish\StringStrategy\clusters.png')

import pandas as pd
import numpy as np
import datetime
import os
from pandas import DataFrame
from numpy import nan as NA
from IPython.core.display import HTML
from IPython.core.display import Image
from IPython.display import Math
from IPython.display import Latex
import collections
import jellyfish as jf
import re
import random
import itertools
%qtconsole

def SLINK(SList, Threshold, Similarity=None):
    """Greedily group strings into clusters of mutually similar strings.

    The strings are visited in random order. Each string is scored against the
    current representative ("centroid") of every existing cluster; it joins
    the best-scoring cluster when that score is at least ``Threshold`` and
    otherwise starts a new cluster of its own. Because the visiting order is
    random, repeated calls may return different clusterings.

    Parameters
    ----------
    SList : list of str
        Strings to cluster. The list is not modified.
    Threshold : float
        Minimum similarity required to join an existing cluster.
    Similarity : callable, optional
        ``Similarity(a, b) -> float`` where larger means more alike. Defaults
        to jellyfish's Jaro similarity.

    Returns
    -------
    list of list of str
        One inner list per cluster, in order of cluster creation.
    """
    if Similarity is None:
        # Newer jellyfish releases expose the Jaro score as jaro_similarity;
        # fall back to the older jaro_distance name used by this file.
        Similarity = getattr(jf, 'jaro_similarity', None) or jf.jaro_distance

    # Shuffle a private copy so the caller's list keeps its order.
    Pending = list(SList)
    random.shuffle(Pending)

    Clusters = []
    Centroid = []   # representative string of each cluster
    Scores = []     # best joining score recorded per cluster (None until one joins)

    for string in Pending:
        # Find the closest centroid; strict '>' keeps the earliest cluster on ties.
        BestInd = None
        BestVal = None
        for ClustNum, Rep in enumerate(Centroid):
            Dist = Similarity(string, Rep)
            if BestVal is None or Dist > BestVal:
                BestInd, BestVal = ClustNum, Dist

        # No cluster yet, or nothing close enough: the string seeds a new cluster.
        if BestInd is None or BestVal < Threshold:
            Clusters.append([string])
            Centroid.append(string)
            Scores.append(None)
            continue

        Clusters[BestInd].append(string)

        if Scores[BestInd] is None:
            # First string to join: remember its score, keep the seed as centroid.
            Scores[BestInd] = BestVal
        elif BestVal > Scores[BestInd]:
            # A later string that beats the recorded score becomes the centroid.
            Scores[BestInd] = BestVal
            Centroid[BestInd] = string

    return Clusters

# Load the distinct values of the 'style' column. dropna() removes missing
# values up front, so this no longer fails when the column has no NaN.
Styles = list(pd.read_csv(r'C:\Users\rcreedon\Dropbox\GIZSupervisor\DATA\Production_Data\STP_Data\Data_Sets\Wave1\1005\1005_all_merged.csv')['style'].dropna().unique())

# Normalise each style: drop everything except letters, digits and spaces,
# lower-case it, and collapse runs of whitespace into single spaces.
Styles = [" ".join(re.sub(r'[^A-Za-z0-9 ]+', '', style).lower().split()) for style in Styles]
Styles

Clusters = SLINK(Styles, 0.8)

# Largest clusters first (stable, so equally sized clusters keep their order).
Clusters.sort(key=len, reverse=True)
Clusters

class Stripped(object):
    """Pair an original string with a normalised ("stripped") version of it.

    Parameters
    ----------
    original : str
        The string exactly as supplied.
    GenericAll : iterable of str
        Patterns removed wherever they occur in the string. Each entry is
        passed to ``re.sub`` unescaped, so it is treated as a regular
        expression.
    GenericWhite : iterable of str, optional
        Patterns removed only where they stand alone between word boundaries
        (``\\b``), i.e. not when they are part of a longer word.
    DelWhite : bool, optional
        When true, every space character is removed from the result.
    """

    def __init__(self, original, GenericAll, GenericWhite=None, DelWhite=False):
        # The string in its original format.
        self.original = original

        # Lower-case, then turn every run of characters other than letters,
        # digits and spaces into a single space.
        StrVal = re.sub(r'[^A-Za-z0-9 ]+', ' ', original.lower())

        # Remove generic sub-strings wherever they appear.
        for word in GenericAll:
            StrVal = re.sub(word, '', StrVal)

        # Remove generic tokens only when they are whole words.
        if GenericWhite is not None:
            for word in GenericWhite:
                StrVal = re.sub(r'\b' + word + r'\b', '', StrVal)

        # Optionally squeeze out all spaces.
        if DelWhite:
            StrVal = StrVal.replace(' ', '')

        # The stripped string.
        self.stripped = StrVal

    def __repr__(self):
        return 'Stripped(original=%r, stripped=%r)' % (self.original, self.stripped)


def SlinkSC(ClassList, Threshold, Similarity=None):
    """Cluster objects by the similarity of their ``stripped`` strings.

    The objects are visited in random order. Each one's ``stripped`` string is
    scored against the stripped centroid of every existing cluster; it joins
    the best-scoring cluster when that score is at least ``Threshold`` and
    otherwise starts a new cluster. Because the visiting order is random,
    repeated calls may return different clusterings.

    Parameters
    ----------
    ClassList : list
        Objects exposing ``original`` and ``stripped`` string attributes.
        The list is not modified.
    Threshold : float
        Minimum similarity required to join an existing cluster.
    Similarity : callable, optional
        ``Similarity(a, b) -> float`` where larger means more alike. Defaults
        to jellyfish's Jaro similarity.

    Returns
    -------
    list
        ``[Clusters, ClustersStripped, Centroid]``, three parallel lists with
        one entry per cluster: the members' original strings, the members'
        stripped strings, and the centroid as ``[stripped, original]``.
    """
    if Similarity is None:
        # Newer jellyfish releases expose the Jaro score as jaro_similarity;
        # fall back to the older jaro_distance name used by this file.
        Similarity = getattr(jf, 'jaro_similarity', None) or jf.jaro_distance

    # Shuffle a private copy so the caller's list keeps its order.
    Pending = list(ClassList)
    random.shuffle(Pending)

    Clusters = []
    ClustersStripped = []
    Centroid = []   # [stripped, original] of each cluster's representative
    Scores = []     # best joining score recorded per cluster (None until one joins)

    for StrippedClass in Pending:
        # Find the closest centroid; strict '>' keeps the earliest cluster on ties.
        BestInd = None
        BestVal = None
        for ClustNum, Rep in enumerate(Centroid):
            Dist = Similarity(StrippedClass.stripped, Rep[0])
            if BestVal is None or Dist > BestVal:
                BestInd, BestVal = ClustNum, Dist

        # No cluster yet, or nothing close enough: start a new cluster.
        if BestInd is None or BestVal < Threshold:
            Clusters.append([StrippedClass.original])
            ClustersStripped.append([StrippedClass.stripped])
            Centroid.append([StrippedClass.stripped, StrippedClass.original])
            Scores.append(None)
            continue

        Clusters[BestInd].append(StrippedClass.original)
        ClustersStripped[BestInd].append(StrippedClass.stripped)

        if Scores[BestInd] is None:
            # First member to join: remember its score, keep the seed as centroid.
            Scores[BestInd] = BestVal
        elif BestVal > Scores[BestInd]:
            # A later member that beats the recorded score becomes the centroid.
            Scores[BestInd] = BestVal
            Centroid[BestInd][0] = StrippedClass.stripped
            Centroid[BestInd][1] = StrippedClass.original

    return [Clusters, ClustersStripped, Centroid]

# Reload the distinct values of the 'style' column; dropna() removes missing
# values so no NaN has to be deleted from the list afterwards.
Styles = list(pd.read_csv(r'C:\Users\rcreedon\Dropbox\GIZSupervisor\DATA\Production_Data\STP_Data\Data_Sets\Wave1\1005\1005_all_merged.csv')['style'].dropna().unique())

# Normalise: keep letters, digits and spaces only, lower-case, collapse whitespace.
Styles = [" ".join(re.sub(r'[^A-Za-z0-9 ]+', '', style).lower().split()) for style in Styles]

# Count how often each word occurs across the cleaned styles and list the
# words that occur more than once.
WordDict = collections.Counter(word for style in Styles for word in style.split(' '))
for word, value in WordDict.items():
    if value > 1:
        print('%s %s' % (word, value))

GenericAll = ['denim', 'jkt', 'chino', 'short', 'jacket']

ClassList = [Stripped(elem, GenericAll, DelWhite = True) for elem in Styles]

Clustered = SlinkSC(ClassList, 0.8)
# Both lists are sorted largest-cluster-first. Corresponding clusters have the
# same length and the sort is stable, so the two lists stay aligned.
ClustersOriginal = Clustered[0]
ClustersOriginal.sort(key=len, reverse=True)
ClustersStripped = Clustered[1]
ClustersStripped.sort(key=len, reverse=True)

# Original clusters
ClustersOriginal[:8]

# Stripped clusters
ClustersStripped[:8]

Bstyles = list(pd.read_csv('ExampleData.csv')['style'].unique())

# Sub-strings removed wherever they occur in a style name.
GenericRemoveAll = ['bottom', 'boys', 'long', 'topb', 'tank', 'basic', 'polo', 'shorts', 'ssivts',
                    'top', 'sslvts', 'mens', 'nightware', 'lslvts', 'nightwear', 'msp', 'lsivts', 'tee', 'large', 'slv']

# Tokens removed only when they stand alone as whole words.
GenericWhiteSpace = ['l', 'pj', 's']

BClassList = [Stripped(elem, GenericRemoveAll, GenericWhiteSpace, True) for elem in Bstyles]

BClustersSC = SlinkSC(BClassList, 0.85)
# Give separate names to each list in the result, largest clusters first.
BClustersOriginal = BClustersSC[0]
BClustersOriginal.sort(key=len, reverse=True)
BClustersStripped = BClustersSC[1]
BClustersStripped.sort(key=len, reverse=True)

BClustersOriginal[:5]

BClustersStripped[:5]

BLClustersSC = SlinkSC(BClassList, 0.95)
# Read the 0.95 run's own result (BLClustersSC). Taking these from BClustersSC
# would simply alias the 0.85 clusters above.
BLClustersOriginal = BLClustersSC[0]
BLClustersOriginal.sort(key=len, reverse=True)
BLClustersStripped = BLClustersSC[1]
BLClustersStripped.sort(key=len, reverse=True)

BLClustersOriginal[:5]

# Number of multi-member clusters with threshold at 0.85
print(len([elem for elem in BClustersOriginal if len(elem) > 1]))

# Number of multi-member clusters with threshold at 0.95
print(len([elem for elem in BLClustersOriginal if len(elem) > 1]))

DF = pd.read_csv('ExampleData.csv')

# The column is addressed as DF['style'] throughout: attribute access
# (DF.style) resolves to pandas' Styler accessor rather than the column.

#Example 1
for style in BClustersOriginal[1]:
    print(DF[DF['style'] == style]['smv'].unique())
print(BClustersOriginal[1])
print(BClustersStripped[1])

#Example 2
for style in BLClustersOriginal[0]:
    print(DF[DF['style'] == style]['smv'].unique())

def maxClusters(ClassList, startThreshold, stopThreshold, step, df=None, col1='style', col2='smv'):
    """Count consistent clusters at each threshold of an evenly spaced sweep.

    For every threshold ``startThreshold + i * step`` up to ``stopThreshold``
    the items are clustered with ``SlinkSC``. A cluster with more than one
    member counts as consistent when the rows of ``df`` matching its members
    in ``col1`` hold exactly one distinct value in ``col2``.

    Parameters
    ----------
    ClassList : list
        Objects accepted by ``SlinkSC``.
    startThreshold, stopThreshold : float
        Inclusive bounds of the sweep.
    step : float
        Positive spacing between thresholds.
    df : DataFrame, optional
        Table to look the members up in; defaults to the module-level ``DF``.
    col1, col2 : str, optional
        Column holding the clustered strings and column whose values must agree.

    Returns
    -------
    list of int
        Entry ``i`` is the consistent-cluster count at threshold
        ``startThreshold + i * step``.

    Raises
    ------
    ValueError
        If ``step`` is not positive (the sweep would never finish).
    """
    if step <= 0:
        raise ValueError('step must be positive')
    if df is None:
        df = DF
    if startThreshold > stopThreshold:
        return []

    # Derive each threshold from its index instead of repeatedly adding step:
    # accumulated rounding error can otherwise skip the end point. The small
    # tolerance keeps an end point that is an exact multiple of step.
    NumSteps = int((stopThreshold - startThreshold) / float(step) + 1e-9)

    ConsistentClusters = []
    for StepNum in range(NumSteps + 1):
        Threshold = startThreshold + StepNum * step
        TotClusters = SlinkSC(ClassList, Threshold)

        ConsistentCount = 0
        for elem in TotClusters[0]:
            if len(elem) < 2:
                continue
            # Distinct col2 values across every member of the cluster.
            MultiSMV = set(itertools.chain(*[df[df[col1] == style][col2].unique() for style in elem]))
            if len(MultiSMV) == 1:
                ConsistentCount += 1
        ConsistentClusters.append(ConsistentCount)

    return ConsistentClusters

        
# Three repeated sweeps. Clustering order is random, so the counts can differ
# from run to run.
for Run in (1, 2, 3):
    print(maxClusters(BClassList, 0.7, 0.95, 0.025))

# Ten coarse sweeps: record the threshold at which each sweep peaked.
Optimum = []
for Trial in range(10):
    Counts = maxClusters(BClassList, 0.7, 0.95, 0.025)
    Peak = Counts.index(max(Counts))
    Optimum.append(0.7 + (Peak*0.025))

Optimum

# Ten finer sweeps over a narrower range of thresholds.
Optimum2 = []
for Trial in range(10):
    Counts = maxClusters(BClassList, 0.82, 0.88, 0.01)
    Peak = Counts.index(max(Counts))
    Optimum2.append(0.82 + (Peak*0.01))

print(Optimum2)
print(np.mean(Optimum2))

def IdentifyClusters(ClassList, Threshold, df, col1, col2):
    """Return the multi-member clusters whose members agree on ``col2``.

    The items are clustered with ``SlinkSC``. For every cluster with more than
    one member, the ``col2`` values of the ``df`` rows whose ``col1`` equals a
    member are collected; the cluster is kept when exactly one distinct value
    results.

    Returns a dict mapping the original string of the cluster's centroid to
    the list of the cluster's original strings.
    """
    Result = SlinkSC(ClassList, Threshold)
    Originals = Result[0]
    Centroids = Result[2]

    Consistent = {}
    for Members, Rep in zip(Originals, Centroids):
        # Single-member clusters carry no matching information.
        if len(Members) <= 1:
            continue

        Values = set()
        for elem in Members:
            Values.update(df[df[col1] == elem][col2].unique())

        if len(Values) == 1:
            Consistent[Rep[1]] = Members

    return Consistent
        

MatchedDict = IdentifyClusters(BClassList, 0.85, DF, 'style', 'smv')

len(MatchedDict)

# Total number of styles covered by the matched clusters.
Count = sum(len(value) for value in MatchedDict.values())

Count

# Show up to ten of the matched clusters.
for Key in list(MatchedDict)[:10]:
    print('The cluster name is ' + Key)
    print(MatchedDict[Key])
    print('')

ForFailed = SlinkSC(BClassList, 0.85)
# NOTE(review): len(elem) > 0 is true for every cluster, so single-member
# clusters are kept here as well; confirm whether "> 1" was intended.
Potential = [elem for elem in ForFailed[0] if len(elem) > 0]

# Print a selection of the failed clusters. The column is read as DF['style']
# because DF.style resolves to pandas' Styler accessor, not the column.
for elem in Potential[:15]:
    SMVs = list(itertools.chain(*[DF[DF['style'] == style]['smv'].unique() for style in elem]))
    if len(set(SMVs)) != 1:
        print(elem)
        print(SMVs)
        print('')

def ClustersRemainders(ClassList, Threshold, df, col1, col2):
    """Match fully consistent clusters, and consistent sub-groups of the rest.

    The items are clustered with ``SlinkSC`` and every cluster with more than
    one member is examined:

    * if all members share a single ``col2`` value, the whole cluster is kept
      under the original string of its centroid;
    * otherwise, provided each member contributed exactly one ``col2`` value,
      the members are regrouped by that value and every group of two or more
      is kept under its shortest member name.

    Returns a dict mapping a group name to the list of original strings in it.
    """
    Result = SlinkSC(ClassList, Threshold)

    Matched = {}
    for Members, Rep in zip(Result[0], Result[2]):
        if len(Members) < 2:
            continue

        # col2 values of all members, concatenated in member order.
        Values = list(itertools.chain.from_iterable(
            df[df[col1] == elem][col2].unique() for elem in Members))
        Distinct = set(Values)

        if len(Distinct) == 1:
            Matched[Rep[1]] = Members
            continue

        # Values line up one-to-one with members only when the lengths agree.
        if len(Values) != len(Members):
            continue

        for smv in list(Distinct):
            if Values.count(smv) < 2:
                continue
            Group = [name for name, val in zip(Members, Values) if val == smv]
            # Shortest name first; it becomes the group's key.
            Group.sort(key=len)
            Matched[Group[0]] = Group

    return Matched

MatchedDict2 = ClustersRemainders(BClassList, 0.85, DF, 'style', 'smv')

len(MatchedDict2)

# Total number of styles covered by the matched groups.
Count = sum(len(value) for value in MatchedDict2.values())

Count

# Invert the groups: every member style points at its group's name.
StringMap = {}
for key, value in MatchedDict2.items():
    for elem in value:
        StringMap[elem] = key

# DF['style'] is used instead of DF.style, which resolves to pandas' Styler
# accessor rather than the column.
len(DF['style'].unique())

# Replace each style by its group name, leaving unmatched styles unchanged.
# One vectorised assignment replaces the row-by-row chained assignment, which
# is not guaranteed to write back into DF.
DF['style_new'] = DF['style'].map(lambda name: StringMap.get(name, name))

len(DF['style_new'].unique())

SecondPass = list(DF['style_new'].unique())

SecondClassList = [Stripped(elem, GenericRemoveAll, GenericWhiteSpace, DelWhite = True) for elem in SecondPass]

SecondClusters = SlinkSC(SecondClassList, 0.85)

# Give separate names to each list of the second-pass result, largest clusters
# first. These are read from SecondClusters; BClustersSC holds the first pass.
SecondClusOrig = SecondClusters[0]
SecondClusOrig.sort(key=len, reverse=True)
SecondClusStrip = SecondClusters[1]
SecondClusStrip.sort(key=len, reverse=True)

SecondMatchedDict = ClustersRemainders(SecondClassList, 0.85, DF, 'style', 'smv')

len(SecondMatchedDict)

# Total number of styles covered by the second-pass groups.
Count = sum(len(value) for value in SecondMatchedDict.values())

Count

for key, value in SecondMatchedDict.items():
    print('%s %s' % (key, value))
    print('')

def ClustersRecursive(Threshold, df, col1, col2, GenericAll, GenericWhite = None, DelWhite = False):
    """Repeatedly merge matching strings in ``df[col1]`` until nothing changes.

    Each pass strips and clusters the distinct values of ``df[col1]`` and
    looks for groups to merge:

    * a multi-member cluster whose members share one ``col2`` value is merged
      under the original string of its centroid;
    * in any other multi-member cluster where every member has exactly one
      ``col2`` value, members sharing a value (two or more) are merged under
      the shortest member name.

    The merged names are written back into ``df[col1]`` in place and another
    pass is run with the same stripping options. The loop stops when a pass
    finds no multi-member cluster (printing 'Finished1') or no group to merge.

    Parameters
    ----------
    Threshold : float
        Similarity threshold handed to ``SlinkSC``.
    df : DataFrame
        Table that is read and updated in place.
    col1, col2 : str
        Column holding the strings to merge and column whose values must agree.
    GenericAll, GenericWhite, DelWhite
        Stripping options forwarded to ``Stripped`` on every pass.

    Returns
    -------
    None
    """
    while True:
        Styles = df[col1].unique()
        ClassList = [Stripped(style, GenericAll, GenericWhite, DelWhite) for style in Styles]
        Clusters = SlinkSC(ClassList, Threshold)

        MultiClusters = [(Members, Rep) for Members, Rep in zip(Clusters[0], Clusters[2])
                         if len(Members) > 1]
        if not MultiClusters:
            print('Finished1')
            return

        # Maps every member of a merged group to the name it is merged under.
        StringMap = {}
        for Members, Rep in MultiClusters:
            MultiSMV = list(itertools.chain(*[df[df[col1] == elem][col2].unique() for elem in Members]))
            if len(set(MultiSMV)) == 1:
                for elem in Members:
                    StringMap[elem] = Rep[1]
            elif len(MultiSMV) == len(Members):
                # One value per member, so values and members line up by position.
                for smv in list(set(MultiSMV)):
                    if MultiSMV.count(smv) >= 2:
                        StrList = [name for name, val in zip(Members, MultiSMV) if val == smv]
                        StrList.sort(key=len)
                        for elem in StrList:
                            StringMap[elem] = StrList[0]

        if not StringMap:
            return

        # Write back through the df that was passed in (not a global frame),
        # as a single column assignment rather than row-wise chained assignment.
        df[col1] = df[col1].map(lambda name: StringMap.get(name, name))

# Start from the untouched styles. DF['style'] is used because DF.style
# resolves to pandas' Styler accessor rather than the column.
DF['style_new2'] = DF['style']

ClustersRecursive(0.85, DF, 'style_new2', 'smv', GenericRemoveAll, GenericWhiteSpace, True)

len(DF['style_new2'].unique())

# Repeat the whole procedure 20 times from scratch and record how many
# distinct names remain after each run.
LenList = []
for Trial in range(20):
    DF['style_new2'] = DF['style']
    ClustersRecursive(0.85, DF, 'style_new2', 'smv', GenericRemoveAll, GenericWhiteSpace, True)
    LenList.append(len(DF['style_new2'].unique()))


print(LenList)