import os
import string
import csv
import nltk
import re
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
#the next line is important for displaying the image; trying to generate a graph without it will cause the notebook to freeze
%matplotlib inline

#set the directories to where you wish to save the files generated by these scripts
directory = "NLMGDict"
if not os.path.exists(directory):
    os.makedirs(directory)
directory = "NLMGText"
if not os.path.exists(directory):
    os.makedirs(directory)

dictDir = "NLMGDict"
textDir = "NLMGText"

print('Character Dictionary Folder' + '\n')
for filename in os.listdir(dictDir):
    if filename == '.DS_Store':
        print('Found and removed .DS_Store in dictDir!')
        os.remove(dictDir + "/.DS_Store")
    else:
        print(filename)

print('\n' + 'Text File Folder' + '\n')
for filename in os.listdir(textDir):
    if filename == '.DS_Store':
        print('Found and removed .DS_Store in textDir!')
        os.remove(textDir + "/.DS_Store")
    else:
        print(filename)

#set textDir equal to the folder where the text(s) you wish to create networks out of are stored
textDir = "NLMGText"
textDirDict = {}
textDictCount = 0
for filename in os.listdir(textDir):
    #add only .txt files
    if filename.find(".txt") == -1:
        continue
    textDirDict[textDictCount] = (textDir + "/" + filename)
    textDictCount = textDictCount + 1

def nerTagger(textString, characters):
    #tokenize the text
    textTokens = nltk.word_tokenize(textString)
    #tag for parts of speech
    textPos = nltk.pos_tag(textTokens)
    #chunk for PERSON, ORGANIZATION, or LOCATION with the NER tagger
    textNE = nltk.ne_chunk(textPos)
    #call findPeople (defined below) to extract the PERSON entities from the chunked text
    findPeople(textNE, characters)
    #eliminate duplicates
    uniqueCharacters = set(characters)
    #remove punctuation from the character names; this will be done with the text as well
    characterList = []
    punct = set(string.punctuation)
    for character in uniqueCharacters:
        character = "".join(ch for ch in character if ch not in punct)
        characterList.append(character)
    #write the cleaned names back into the list passed in by the caller
    #(without this, the deduplicated, punctuation-stripped names would be discarded)
    characters[:] = characterList

def findPeople(tree, people):
    #recursively walk the chunked tree and collect the words of every PERSON subtree
    if type(tree) is nltk.tree.Tree and tree.label() == "PERSON":
        people.append(" ".join([word for word, pos in tree]))
    elif (type(tree) is nltk.tree.Tree) or (type(tree) is list):
        [findPeople(branch, people) for branch in tree]

import urllib.request
enDictURL = "http://www-01.sil.org/linguistics/wordlists/english/wordlist/wordsEn.txt"
enDictString = urllib.request.urlopen(enDictURL).read().decode()
enDictTokens = nltk.word_tokenize(enDictString)
#make sure every single letter counts as an English word as well
alphabet = list(string.ascii_lowercase)
for ch in alphabet:
    enDictTokens.append(ch)

#get the location of the names lists in the NLTK corpora
maleNamesFile = nltk.data.find("corpora/names/male.txt")
femaleNamesFile = nltk.data.find("corpora/names/female.txt")
#create a list of these file locations for iterating over
namesFileList = [maleNamesFile, femaleNamesFile]
#create an empty inclusion list that we will add our names to
inclusionList = []
#iterate over the lists, open the files and add their words as tokens to our inclusion list one at a time
for filename in namesFileList:
    f = open(filename, "r")
    nameString = f.read()
    f.close()
    nameTokens = nltk.word_tokenize(nameString)
    for token in nameTokens:
        inclusionList.append(token.lower())

#use the list below to add any other words you think may occur in character names
inclusionListAdditions = ['miss', 'mr', 'mrs', 'dr', 'doctor', 'general', 'colonel', 'mister',
                          'inspector', 'herr', 'lady', 'sir', 'duke', 'duchess', 'm', 'count',
                          'de', 'captain', 'st', 'lord', 'madam', 'madame']
#iterate over this list and add the words to our inclusion list one at a time
for word in inclusionListAdditions:
    inclusionList.append(word)
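#a quick sanity check of the tagging pipeline before moving on to the clean-up step;
#the sentence below is made up for illustration, and the exact entities returned
#depend on the NLTK models you have downloaded (punkt, averaged_perceptron_tagger,
#maxent_ne_chunker, words)
samplePeople = []
sampleTree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize("Kathy spoke to Tommy about Ruth.")))
findPeople(sampleTree, samplePeople)
print(samplePeople) #something like ['Kathy', 'Tommy', 'Ruth'], depending on the tagger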
def nerCleanUp(cleanCharacterList, characterList, inclusionList):
    #iterate over the character list
    for name in characterList:
        #create a boolean check; if a word in the name is found to be a common word
        #this will be switched, and the name will not be included in our final character list
        wordCheck = False
        #split the name in the case that it is more than one word
        nameList = name.split()
        #check each word in the name against the English word list and the inclusion list;
        #any name containing a word that appears in the English word list but not in the inclusion list is dropped
        for token in nameList:
            for word in enDictTokens:
                if token.lower() == word and token.lower() not in inclusionList:
                    wordCheck = True
        if wordCheck == False:
            cleanCharacterList.append(name)

def narratorAlias(textString):
    #normalize curly quotes so the lookahead below can count quotation marks,
    #and strip apostrophes so contractions like "I'm" become "Im"
    textString = textString.replace("“", '"')
    textString = textString.replace("”", '"')
    textString = textString.replace("’", "'")
    textString = textString.replace("'", "")
    #each pattern matches a first-person pronoun only when it is followed by an even number
    #of quotation marks, i.e. only when it occurs outside of quoted dialogue
    patternList = [r'(\bIm\b)(?=(?:[^"]|"[^"]*")*$)',
                   r'(\bId\b)(?=(?:[^"]|"[^"]*")*$)',
                   r'(\bIve\b)(?=(?:[^"]|"[^"]*")*$)',
                   r'(\bIll\b)(?=(?:[^"]|"[^"]*")*$)',
                   r'(\bI\b)(?=(?:[^"]|"[^"]*")*$)',
                   r'(\bme\b)(?=(?:[^"]|"[^"]*")*$)',
                   r'(\bmy\b)(?=(?:[^"]|"[^"]*")*$)']
    for pattern in patternList:
        textString = re.sub(pattern, 'NARRATOR', textString)
    return textString

#test the NARRATOR alias by key; change textDirDict[#] to view different texts
f = open(textDirDict[0], "r")
textString = f.read()
f.close()
textTest = narratorAlias(textString)
#first 1000 characters; change the value or remove the bracketed part to view more or all of the text
textTest[:1000]

textCount = 0
for key in textDirDict:
    f = open(textDirDict[textCount], "r")
    textString = f.read()
    f.close()
    characterList = []
    #call the NE tagger
    nerTagger(textString, characterList)
    #remove duplicates
    uniqueCharacterList = set(characterList)
    cleanCharacterList = []
    #create a variable out of the inclusionList generated above
    localInclusionList = inclusionList
    #call the NE list clean-up method from above
    nerCleanUp(cleanCharacterList, uniqueCharacterList, localInclusionList)
    #create the header for the csv files
    csvCharacterList = []
    csvCharacterList.append(['id', 'name', 'identification'])
    characterCount = 1
    #add characters from the cleanCharacterList into the list to be turned into a csv file
    for character in cleanCharacterList:
        characterEntry = [str(characterCount), character, 'True']
        csvCharacterList.append(characterEntry)
        characterCount += 1
    #turn the csvCharacterList into a matrix in order to write it to file
    characterMatrix = pd.DataFrame(csvCharacterList)
    csvFile = characterMatrix.to_csv(index=False, header=False)
    #from the textDirDict entry extract the filename, stripping the directory prefix and the .txt extension
    removeDir = len('NLMGText/')
    removeTxt = textDirDict[textCount].find('.txt')
    filename = textDirDict[textCount][removeDir:removeTxt]
    #create a new csv file and write the matrix to it
    f = open("NLMGDict/" + filename + "_Dictionary.csv", "w")
    f.write(csvFile)
    f.close()
    textCount += 1
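#before reusing narratorAlias in the matching step below, a closer look at its
#quote handling: the lookahead (?=(?:[^"]|"[^"]*")*$) only succeeds when the match
#is followed by an even number of quotation marks, so pronouns inside dialogue are
#left alone; the sample sentence is made up for illustration
sample = 'I walked over. "Did I say that?" I asked.'
print(re.sub(r'(\bI\b)(?=(?:[^"]|"[^"]*")*$)', 'NARRATOR', sample))
#NARRATOR walked over. "Did I say that?" NARRATOR asked.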
#this is the same directory you have just written the dictionaries to
csvDir = "NLMGDict"
textCount = 0
hitDict = {}
hitDictCount = 0
for filename in os.listdir(csvDir):
    #if the file is not a csv file, move on to the next iteration of the loop;
    #this is necessary as system-generated hidden files (.DS_Store in OS X for example) will cause an error
    #(thanks Stefan Sinclair for this snippet)
    if filename.find(".csv") == -1:
        continue
    #create a temporary dictionary under the headings in the character dictionary for each csv
    d = {}
    d['character'] = []
    d['value'] = []
    d['identification'] = []
    #read the csv into a dictionary; the field names match the columns written above
    #('character' holds the id and 'value' holds the name)
    csvDict = csv.DictReader(open((csvDir + "/" + filename), 'rt', encoding="utf-8"),
                             fieldnames=['character', 'value', 'identification'],
                             delimiter=',', quotechar='"')
    #skip over the header row (fieldnames)
    next(csvDict)
    #these two variables are needed for removing the punctuation in the next step
    emptyString = ""
    exclude = set(string.punctuation)
    #create a character dictionary keyed by name with the id as value; strip punctuation and make lowercase
    characterDict = {}
    for row in csvDict:
        charNameString = row['value']
        charNameStringClean = emptyString.join(ch for ch in charNameString if ch not in exclude)
        characterDict[charNameStringClean.lower()] = row['character']
    #open the text from textDirDict and read it into a string
    f = open(textDirDict[textCount], "r")
    textString = f.read()
    f.close()
    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    #if the text is first person, include the following line to alias the first-person pronouns within the text;
    #be sure to comment out this line if the text has a third-person narrator
    textString = narratorAlias(textString)
    #strip punctuation and make lowercase
    cleanTextString = emptyString.join(ch for ch in textString if ch not in exclude).lower()
    #split the text string on paragraphs
    textParagraphs = cleanTextString.split("\n")
    #these counters are used for keeping track of what paragraph an instance is found in;
    #this is essential for validating the results
    paraCount = 0
    hitCount = 0
    paraHits = {}
    #iterate over paragraphs, iterate over characterDict, match names in the paragraph and record each instance in paraHits;
    #this creates a temporary paragraph-level dictionary, from which a list of all hits for the current text is created
    for para in textParagraphs:
        for name, charId in characterDict.items():
            #if name in para:
            #if (" " + name + " ") in para:
            #re.escape guards against names that happen to contain regex metacharacters
            if re.search('\\b' + re.escape(name) + '\\b', para) is not None or (name + "s ") in para:
                paraHits[hitCount] = [textCount, paraCount, name, charId]
                hitCount = hitCount + 1
        paraCount = paraCount + 1
    #add paraHits into a text-level list called hitList
    hitList = []
    for hitKey, instance in paraHits.items():
        hitList.append(instance)
    #add hitList to hitDict indexed by number as string;
    #the index must be a string if you want to iterate over this dictionary, otherwise you receive the error
    #"cannot iterate over integer"
    hitDict[str(hitDictCount)] = hitList
    #increment counters
    textCount = textCount + 1
    hitDictCount = hitDictCount + 1

hitDict['0'][:20] #first twenty for legibility; change the value or remove the brackets to view more

interactionDict = {}
for key in hitDict:
    paraHitDict = {}
    for text, para, name, charId in hitDict[key]:
        #check if the paragraph already exists as a key in paraHitDict
        if para in paraHitDict:
            #check if the id already exists in the id list; if so pass, if not append
            if charId in paraHitDict[para]:
                pass
            else:
                paraHitDict[para].append(charId)
        else:
            #if the paragraph does not exist as a key, add it and assign the id as a list
            paraHitDict[para] = [charId]
    interactionDict[key] = paraHitDict

print(interactionDict['0'])

from collections import defaultdict
textEdgesDict = {}
for key in interactionDict:
    edgesDictionary = defaultdict(int)
    for para, people in interactionDict[key].items():
        #count every unordered pair of ids that co-occur in a paragraph;
        #the personA < personB comparison keeps each pair counted only once
        for personA in people:
            for personB in people:
                if personA < personB:
                    edgesDictionary[personA + " -- " + personB] += 1
    textEdgesDict[key] = edgesDictionary
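#a quick, self-contained sanity check of the pair-counting logic above on toy data
#(the paragraph-to-id mapping here is made up for illustration): ids that share a
#paragraph yield one increment per unordered pair
toyParagraphs = {0: ['1', '2'], 1: ['1', '2', '3'], 2: ['2']}
toyEdges = defaultdict(int)
for para, people in toyParagraphs.items():
    for personA in people:
        for personB in people:
            if personA < personB:
                toyEdges[personA + " -- " + personB] += 1
print(dict(toyEdges)) #expect {'1 -- 2': 2, '1 -- 3': 1, '2 -- 3': 1}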
textEdgesDict['0'] #display the edge counts for the first text

edgesFreqsDict = {}
for key in textEdgesDict:
    edgesFreqs = nltk.FreqDist(textEdgesDict[key])
    edgesFreqsDict[key] = edgesFreqs
edgesFreqs = edgesFreqsDict['0']

G = nx.Graph()
plt.figure(figsize=(10, 10))
# create graph edges (node pairs) and keep track of the edges for each count
edges = defaultdict(list)
for names, count in edgesFreqs.most_common():
    if count > 0:
        parts = names.split(" -- ")
        G.add_edge(parts[0], parts[1], width=count)
        edges[count].append((parts[0], parts[1]))
    else:
        break
# draw labels (nx.draw(G) doesn't really work)
pos = nx.spring_layout(G)
nx.draw_networkx_labels(G, pos)
# draw edges with different widths
for count, edgelist in edges.items():
    nx.draw_networkx_edges(G, pos, edgelist=edgelist, width=count, alpha=0.1)
plt.axis('off')
plt.show()

G = nx.Graph()
plt.figure(figsize=(10, 10))
# create graph edges (node pairs) and keep track of the edges for each count,
# this time looking up the character names for the ids so the node labels are readable
edges = defaultdict(list)
for names, count in edgesFreqs.most_common():
    if count > 0:
        parts = names.split(" -- ")
        nodeNames = []
        for idNum in parts:
            for key in characterDict:
                if characterDict[key] == idNum:
                    nodeNames.append(key)
                    break
        G.add_edge(nodeNames[0], nodeNames[1], width=count)
        edges[count].append((nodeNames[0], nodeNames[1]))
    else:
        break
# draw labels (nx.draw(G) doesn't really work)
pos = nx.spring_layout(G)
nx.draw_networkx_labels(G, pos)
# draw edges with different widths
for count, edgelist in edges.items():
    nx.draw_networkx_edges(G, pos, edgelist=edgelist, width=count, alpha=0.1)
plt.axis('off')
plt.show()

#tokenize the text as sentences, setting abbreviation parameters so titles like "Mr." do not end a sentence
from nltk.tokenize.punkt import PunktParameters
punkt_param = PunktParameters()
punkt_param.abbrev_types = set(['dr', 'mr', 'mrs', 'prof', 'st', 'ft'])
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(punkt_param)
textSentences = tokenizer.tokenize(textString, realign_boundaries=True)

#write each paragraph of each text to its own numbered file so hits can be validated by paragraph number
textDir = "NLMGText"
textDirDict = {}
textDictCount = 0
for filename in os.listdir(textDir):
    #add only .txt files (hidden system files would otherwise cause errors below)
    if filename.find(".txt") == -1:
        continue
    textDirDict[textDictCount] = (textDir + "/" + filename)
    textDictCount = textDictCount + 1
    #create a folder named after the text to hold its paragraph files
    directory = filename
    if not os.path.exists(directory):
        os.makedirs(directory)
    f = open("NLMGText/" + filename, "r")
    textString = f.read()
    f.close()
    textParagraphs = textString.split("\n")
    paraCount = 0
    for para in textParagraphs:
        f = open(directory + "/" + str(paraCount) + ".txt", "w")
        f.write(para)
        f.close()
        paraCount = paraCount + 1

hitDict['0'] #display the hits for the first text for validation
interactionDict['0'] #display the paragraph-level interactions for the first text

#print every edge and its count for each text
for key in edgesFreqsDict:
    print("text#: " + key + "\n")
    for edge, count in edgesFreqsDict[key].items():
        print(edge + ": " + str(count))
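#optional: a minimal sketch for exporting each text's edge list as a CSV that can be
#loaded into network tools such as Gephi; the Source/Target/Weight headers follow
#Gephi's edge-import convention, and the "NLMGEdges" folder name is an assumption,
#not something created earlier in this notebook (csv and os are imported at the top)
edgeDir = "NLMGEdges" #hypothetical output folder
if not os.path.exists(edgeDir):
    os.makedirs(edgeDir)
for key in edgesFreqsDict:
    with open(edgeDir + "/text" + key + "_Edges.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(['Source', 'Target', 'Weight'])
        for edge, count in edgesFreqsDict[key].items():
            source, target = edge.split(" -- ")
            writer.writerow([source, target, count])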