import os
import string
import csv
import nltk
import re
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
#the next line is important for displaying the image; trying to generate a graph without it will cause the notebook to freeze
%matplotlib inline

#set the directories to where you wish to save the files generated by these scripts
directory = "NLMGDict"
if not os.path.exists(directory):
    os.makedirs(directory)
directory = "NLMGText"
if not os.path.exists(directory):
    os.makedirs(directory)

dictDir = "NLMGDict"
textDir = "NLMGText"

print('Character Dictionary Folder' + '\n')
for filename in os.listdir(dictDir):
    if filename == '.DS_Store':
        print('Found and removed .DS_Store in dictDir!')
        os.remove(dictDir + "/.DS_Store")
    else:
        print(filename)

print('\n' + 'Text File Folder' + '\n')
for filename in os.listdir(textDir):
    if filename == '.DS_Store':
        print('Found and removed .DS_Store in textDir!')
        os.remove(textDir + "/.DS_Store")
    else:
        print(filename)

#set textDir equal to the folder where the text(s) you wish to create networks out of are stored
textDir = "NLMGText"
textDirDict = {}
textDictCount = 0
for filename in os.listdir(textDir):
    #add only .txt files
    if filename.find(".txt") == -1:
        continue
    textDirDict[textDictCount] = (textDir + "/" + filename)
    textDictCount = textDictCount + 1

def nerTagger(textString, characters):
    #tokenize the text
    textTokens = nltk.word_tokenize(textString)
    #tag for parts of speech
    textPos = nltk.pos_tag(textTokens)
    #chunk for PERSON, ORGANIZATION, or LOCATION with the NER tagger
    textNE = nltk.ne_chunk(textPos)
    #call findPeople (defined below) to extract the PERSON entities from the chunked text
    findPeople(textNE, characters)
    #eliminate duplicates
    uniqueCharacters = set(characters)
    #remove punctuation from the character names; this will be done with the text as well
    characterList = []
    punct = set(string.punctuation)
    for character in uniqueCharacters:
        character = "".join(ch for ch in character if ch not in punct)
        characterList.append(character)
    #write the cleaned names back into the list passed in by the caller
    #(without this, the deduplicated, punctuation-stripped names would be discarded)
    characters[:] = characterList

def findPeople(tree, people):
    #recursively walk the chunked tree and collect the words of every PERSON subtree
    if type(tree) is nltk.tree.Tree and tree.label() == "PERSON":
        people.append(" ".join([word for word, pos in tree]))
    elif (type(tree) is nltk.tree.Tree) or (type(tree) is list):
        [findPeople(branch, people) for branch in tree]

import urllib.request
enDictURL = "http://www-01.sil.org/linguistics/wordlists/english/wordlist/wordsEn.txt"
enDictString = urllib.request.urlopen(enDictURL).read().decode()
enDictTokens = nltk.word_tokenize(enDictString)
#make sure every single letter counts as an English word as well
alphabet = list(string.ascii_lowercase)
for ch in alphabet:
    enDictTokens.append(ch)

#get the location of the names lists in the NLTK corpora
maleNamesFile = nltk.data.find("corpora/names/male.txt")
femaleNamesFile = nltk.data.find("corpora/names/female.txt")
#create a list of these file locations for iterating over
namesFileList = [maleNamesFile, femaleNamesFile]
#create an empty inclusion list that we will add our names to
inclusionList = []
#iterate over the lists, open the files and add their words as tokens to our inclusion list one at a time
for filename in namesFileList:
    f = open(filename, "r")
    nameString = f.read()
    f.close()
    nameTokens = nltk.word_tokenize(nameString)
    for token in nameTokens:
        inclusionList.append(token.lower())

#use the list below to add any other words you think may occur in character names
inclusionListAdditions = ['miss', 'mr', 'mrs', 'dr', 'doctor', 'general', 'colonel', 'mister',
                          'inspector', 'herr', 'lady', 'sir', 'duke', 'duchess', 'm', 'count',
                          'de', 'captain', 'st', 'lord', 'madam', 'madame']
#iterate over this list and add the words to our inclusion list one at a time
for word in inclusionListAdditions:
    inclusionList.append(word)
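#a quick sanity check of the tagging pipeline before moving on to the clean-up step;
#the sentence below is made up for illustration, and the exact entities returned
#depend on the NLTK models you have downloaded (punkt, averaged_perceptron_tagger,
#maxent_ne_chunker, words)
samplePeople = []
sampleTree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize("Kathy spoke to Tommy about Ruth.")))
findPeople(sampleTree, samplePeople)
print(samplePeople) #something like ['Kathy', 'Tommy', 'Ruth'], depending on the tagger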
def nerCleanUp(cleanCharacterList, characterList, inclusionList):
    #iterate over the character list
    for name in characterList:
        #create a boolean check; if a word in the name is found to be a common word
        #this will be switched, and the name will not be included in our final character list
        wordCheck = False
        #split the name in the case that it is more than one word
        nameList = name.split()
        #check each word in the name against the English word list and the inclusion list;
        #any name containing a word that appears in the English word list but not in the inclusion list is dropped
        for token in nameList:
            for word in enDictTokens:
                if token.lower() == word and token.lower() not in inclusionList:
                    wordCheck = True
        if wordCheck == False:
            cleanCharacterList.append(name)

def narratorAlias(textString):
    #normalize curly quotes so the lookahead below can count quotation marks,
    #and strip apostrophes so contractions like "I'm" become "Im"
    textString = textString.replace("“", '"')
    textString = textString.replace("”", '"')
    textString = textString.replace("’", "'")
    textString = textString.replace("'", "")
    #each pattern matches a first-person pronoun only when it is followed by an even number
    #of quotation marks, i.e. only when it occurs outside of quoted dialogue
    patternList = [r'(\bIm\b)(?=(?:[^"]|"[^"]*")*$)',
                   r'(\bId\b)(?=(?:[^"]|"[^"]*")*$)',
                   r'(\bIve\b)(?=(?:[^"]|"[^"]*")*$)',
                   r'(\bIll\b)(?=(?:[^"]|"[^"]*")*$)',
                   r'(\bI\b)(?=(?:[^"]|"[^"]*")*$)',
                   r'(\bme\b)(?=(?:[^"]|"[^"]*")*$)',
                   r'(\bmy\b)(?=(?:[^"]|"[^"]*")*$)']
    for pattern in patternList:
        textString = re.sub(pattern, 'NARRATOR', textString)
    return textString

#test the NARRATOR alias by key; change textDirDict[#] to view different texts
f = open(textDirDict[0], "r")
textString = f.read()
f.close()
textTest = narratorAlias(textString)
#first 1000 characters; change the value or remove the bracketed part to view more or all of the text
textTest[:1000]

textCount = 0
for key in textDirDict:
    f = open(textDirDict[textCount], "r")
    textString = f.read()
    f.close()
    characterList = []
    #call the NE tagger
    nerTagger(textString, characterList)
    #remove duplicates
    uniqueCharacterList = set(characterList)
    cleanCharacterList = []
    #create a variable out of the inclusionList generated above
    localInclusionList = inclusionList
    #call the NE list clean-up method from above
    nerCleanUp(cleanCharacterList, uniqueCharacterList, localInclusionList)
    #create the header for the csv files
    csvCharacterList = []
    csvCharacterList.append(['id', 'name', 'identification'])
    characterCount = 1
    #add characters from the cleanCharacterList into the list to be turned into a csv file
    for character in cleanCharacterList:
        characterEntry = [str(characterCount), character, 'True']
        csvCharacterList.append(characterEntry)
        characterCount += 1
    #turn the csvCharacterList into a matrix in order to write it to file
    characterMatrix = pd.DataFrame(csvCharacterList)
    csvFile = characterMatrix.to_csv(index=False, header=False)
    #from the textDirDict entry extract the filename, stripping the directory prefix and the .txt extension
    removeDir = len('NLMGText/')
    removeTxt = textDirDict[textCount].find('.txt')
    filename = textDirDict[textCount][removeDir:removeTxt]
    #create a new csv file and write the matrix to it
    f = open("NLMGDict/" + filename + "_Dictionary.csv", "w")
    f.write(csvFile)
    f.close()
    textCount += 1
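#before reusing narratorAlias in the matching step below, a closer look at its
#quote handling: the lookahead (?=(?:[^"]|"[^"]*")*$) only succeeds when the match
#is followed by an even number of quotation marks, so pronouns inside dialogue are
#left alone; the sample sentence is made up for illustration
sample = 'I walked over. "Did I say that?" I asked.'
print(re.sub(r'(\bI\b)(?=(?:[^"]|"[^"]*")*$)', 'NARRATOR', sample))
#NARRATOR walked over. "Did I say that?" NARRATOR asked.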
#this is the same directory you have just written the dictionaries to
csvDir = "NLMGDict"
textCount = 0
hitDict = {}
hitDictCount = 0
for filename in os.listdir(csvDir):
    #if the file is not a csv file, move on to the next iteration of the loop;
    #this is necessary as system-generated hidden files (.DS_Store in OS X for example) will cause an error
    #(thanks Stefan Sinclair for this snippet)
    if filename.find(".csv") == -1:
        continue
    #create a temporary dictionary under the headings in the character dictionary for each csv
    d = {}
    d['character'] = []
    d['value'] = []
    d['identification'] = []
    #read the csv into a dictionary; the field names match the columns written above
    #('character' holds the id and 'value' holds the name)
    csvDict = csv.DictReader(open((csvDir + "/" + filename), 'rt', encoding="utf-8"),
                             fieldnames=['character', 'value', 'identification'],
                             delimiter=',', quotechar='"')
    #skip over the header row (fieldnames)
    next(csvDict)
    #these two variables are needed for removing the punctuation in the next step
    emptyString = ""
    exclude = set(string.punctuation)
    #create a character dictionary keyed by name with the id as value; strip punctuation and make lowercase
    characterDict = {}
    for row in csvDict:
        charNameString = row['value']
        charNameStringClean = emptyString.join(ch for ch in charNameString if ch not in exclude)
        characterDict[charNameStringClean.lower()] = row['character']
    #open the text from textDirDict and read it into a string
    f = open(textDirDict[textCount], "r")
    textString = f.read()
    f.close()
    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    #if the text is first person, include the following line to alias the first-person pronouns within the text;
    #be sure to comment out this line if the text has a third-person narrator
    textString = narratorAlias(textString)
    #strip punctuation and make lowercase
    cleanTextString = emptyString.join(ch for ch in textString if ch not in exclude).lower()
    #split the text string on paragraphs
    textParagraphs = cleanTextString.split("\n")
    #these counters are used for keeping track of what paragraph an instance is found in;
    #this is essential for validating the results
    paraCount = 0
    hitCount = 0
    paraHits = {}
    #iterate over paragraphs, iterate over characterDict, match names in the paragraph and record each instance in paraHits;
    #this creates a temporary paragraph-level dictionary, from which a list of all hits for the current text is created
    for para in textParagraphs:
        for name, charId in characterDict.items():
            #if name in para:
            #if (" " + name + " ") in para:
            #re.escape guards against names that happen to contain regex metacharacters
            if re.search('\\b' + re.escape(name) + '\\b', para) is not None or (name + "s ") in para:
                paraHits[hitCount] = [textCount, paraCount, name, charId]
                hitCount = hitCount + 1
        paraCount = paraCount + 1
    #add paraHits into a text-level list called hitList
    hitList = []
    for hitKey, instance in paraHits.items():
        hitList.append(instance)
    #add hitList to hitDict indexed by number as string;
    #the index must be a string if you want to iterate over this dictionary, otherwise you receive the error
    #"cannot iterate over integer"
    hitDict[str(hitDictCount)] = hitList
    #increment counters
    textCount = textCount + 1
    hitDictCount = hitDictCount + 1

hitDict['0'][:20] #first twenty for legibility; change the value or remove the brackets to view more

interactionDict = {}
for key in hitDict:
    paraHitDict = {}
    for text, para, name, charId in hitDict[key]:
        #check if the paragraph already exists as a key in paraHitDict
        if para in paraHitDict:
            #check if the id already exists in the id list; if so pass, if not append
            if charId in paraHitDict[para]:
                pass
            else:
                paraHitDict[para].append(charId)
        else:
            #if the paragraph does not exist as a key, add it and assign the id as a list
            paraHitDict[para] = [charId]
    interactionDict[key] = paraHitDict

print(interactionDict['0'])

from collections import defaultdict
textEdgesDict = {}
for key in interactionDict:
    edgesDictionary = defaultdict(int)
    for para, people in interactionDict[key].items():
        #count every unordered pair of ids that co-occur in a paragraph;
        #the personA < personB comparison keeps each pair counted only once
        for personA in people:
            for personB in people:
                if personA < personB:
                    edgesDictionary[personA + " -- " + personB] += 1
    textEdgesDict[key] = edgesDictionary
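#a quick, self-contained sanity check of the pair-counting logic above on toy data
#(the paragraph-to-id mapping here is made up for illustration): ids that share a
#paragraph yield one increment per unordered pair
toyParagraphs = {0: ['1', '2'], 1: ['1', '2', '3'], 2: ['2']}
toyEdges = defaultdict(int)
for para, people in toyParagraphs.items():
    for personA in people:
        for personB in people:
            if personA < personB:
                toyEdges[personA + " -- " + personB] += 1
print(dict(toyEdges)) #expect {'1 -- 2': 2, '1 -- 3': 1, '2 -- 3': 1}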
textEdgesDict['0'] #display the edge counts for the first text

edgesFreqsDict = {}
for key in textEdgesDict:
    edgesFreqs = nltk.FreqDist(textEdgesDict[key])
    edgesFreqsDict[key] = edgesFreqs
edgesFreqs = edgesFreqsDict['0']

G = nx.Graph()
plt.figure(figsize=(10, 10))
# create graph edges (node pairs) and keep track of the edges for each count
edges = defaultdict(list)
for names, count in edgesFreqs.most_common():
    if count > 0:
        parts = names.split(" -- ")
        G.add_edge(parts[0], parts[1], width=count)
        edges[count].append((parts[0], parts[1]))
    else:
        break
# draw labels (nx.draw(G) doesn't really work)
pos = nx.spring_layout(G)
nx.draw_networkx_labels(G, pos)
# draw edges with different widths
for count, edgelist in edges.items():
    nx.draw_networkx_edges(G, pos, edgelist=edgelist, width=count, alpha=0.1)
plt.axis('off')
plt.show()

G = nx.Graph()
plt.figure(figsize=(10, 10))
# create graph edges (node pairs) and keep track of the edges for each count,
# this time looking up the character names for the ids so the node labels are readable
edges = defaultdict(list)
for names, count in edgesFreqs.most_common():
    if count > 0:
        parts = names.split(" -- ")
        nodeNames = []
        for idNum in parts:
            for key in characterDict:
                if characterDict[key] == idNum:
                    nodeNames.append(key)
                    break
        G.add_edge(nodeNames[0], nodeNames[1], width=count)
        edges[count].append((nodeNames[0], nodeNames[1]))
    else:
        break
# draw labels (nx.draw(G) doesn't really work)
pos = nx.spring_layout(G)
nx.draw_networkx_labels(G, pos)
# draw edges with different widths
for count, edgelist in edges.items():
    nx.draw_networkx_edges(G, pos, edgelist=edgelist, width=count, alpha=0.1)
plt.axis('off')
plt.show()

#tokenize the text as sentences, setting abbreviation parameters so titles like "Mr." do not end a sentence
from nltk.tokenize.punkt import PunktParameters
punkt_param = PunktParameters()
punkt_param.abbrev_types = set(['dr', 'mr', 'mrs', 'prof', 'st', 'ft'])
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(punkt_param)
textSentences = tokenizer.tokenize(textString, realign_boundaries=True)

#write each paragraph of each text to its own numbered file so hits can be validated by paragraph number
textDir = "NLMGText"
textDirDict = {}
textDictCount = 0
for filename in os.listdir(textDir):
    #add only .txt files (hidden system files would otherwise cause errors below)
    if filename.find(".txt") == -1:
        continue
    textDirDict[textDictCount] = (textDir + "/" + filename)
    textDictCount = textDictCount + 1
    #create a folder named after the text to hold its paragraph files
    directory = filename
    if not os.path.exists(directory):
        os.makedirs(directory)
    f = open("NLMGText/" + filename, "r")
    textString = f.read()
    f.close()
    textParagraphs = textString.split("\n")
    paraCount = 0
    for para in textParagraphs:
        f = open(directory + "/" + str(paraCount) + ".txt", "w")
        f.write(para)
        f.close()
        paraCount = paraCount + 1

hitDict['0'] #display the hits for the first text for validation
interactionDict['0'] #display the paragraph-level interactions for the first text

#print every edge and its count for each text
for key in edgesFreqsDict:
    print("text#: " + key + "\n")
    for edge, count in edgesFreqsDict[key].items():
        print(edge + ": " + str(count))
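#optional: a minimal sketch for exporting each text's edge list as a CSV that can be
#loaded into network tools such as Gephi; the Source/Target/Weight headers follow
#Gephi's edge-import convention, and the "NLMGEdges" folder name is an assumption,
#not something created earlier in this notebook (csv and os are imported at the top)
edgeDir = "NLMGEdges" #hypothetical output folder
if not os.path.exists(edgeDir):
    os.makedirs(edgeDir)
for key in edgesFreqsDict:
    with open(edgeDir + "/text" + key + "_Edges.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(['Source', 'Target', 'Weight'])
        for edge, count in edgesFreqsDict[key].items():
            source, target = edge.split(" -- ")
            writer.writerow([source, target, count])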