import json
import gensim
import matplotlib.pyplot as plt
import numpy as np
from pattern.en import parse
from pattern.en import tag as ptag
import string
import tsne # local copy in the repo
%matplotlib inline
sample_parsed = parse("He is a text sample showing you what happens when you use this function for her.",
tokenize = True, # Split punctuation marks from words?
tags = True, # Parse part-of-speech tags? (NN, JJ, ...)
chunks = False, # Parse chunks? (NP, VP, PNP, ...)
relations = False, # Parse chunk relations? (-SBJ, -OBJ, ...)
lemmata = False, # Parse lemmata? (ate => eat)
encoding = 'utf-8', # Input string encoding.
tagset = None)
# just showing you what it looks like
sample_parsed
u'He/PRP is/VBZ a/DT text/NN sample/NN showing/VBG you/PRP what/WP happens/VBZ when/WRB you/PRP use/VB this/DT function/NN for/IN her/PRP ./.'
""" A bunch of functions that do text cleanup and create the strings of nouns for the model training."""
def tag_by_line(line, tags):
""" Look for only the words with those tags """
words = []
#print ptag(line)
for word, foundtag in ptag(line):
if foundtag in tags:
words.append(word)
return words
def cleanup(text):
""" This function cleans up the text a bit, and removes newlines. """
data = text.decode('utf8')
data = data.replace('\r', '') # windows newline in some files
data = data.replace('\n', ' ') # turn newline into space
data = data.replace('_', '')
data = data.replace('--', ' -- ')
data = data.replace(',', ' ')
data = data.replace('.', ' ')
data = data.strip() # strip remaining white space chars from edges
return data
def cleanup_and_pos(text, pos=['NN']):
"""
Run the cleanup function, convert to a list of tag types, clean out misc stuff left.
Also converts back to a space sep string of the text we want in the model/output only
"""
# put your own list of strings here, based on reading outputs:
other_things_to_strip = ['s', '\'s', 'n\'t', '\'ll', 'chapter', 'CHAPTER']
data = cleanup(text)
words = tag_by_line(data, pos) # returns a list of words of that POS (part of speech)
cleantext = [word.lower() for word in words if word not in string.punctuation]
cleantext = [word for word in cleantext if word not in other_things_to_strip]
newdata = ' '.join(cleantext) # make it a single string, the model likes that
return newdata
"""
Model text generator - based on Radim's sample code for gensim.
This produces a line at a time of cleaned, tag-only text for the model to train from.
"""
import os
import re
class MySentences(object):
def __init__(self, dirname, tags=['NN']):
self.dirname = dirname
self.tags = tags
def __iter__(self):
for fname in os.listdir(self.dirname):
for line in open(os.path.join(self.dirname, fname)):
yield (cleanup_and_pos(line, pos=self.tags)).split()
""" An alternate from gensim tutorials - just use all words in the model. """
class DirOfPlainTextCorpus(object):
"""Iterate over sentences of all plaintext files in a directory """
SPLIT_SENTENCES = re.compile(u"[.!?:]\s+") # split sentences on these characters
def __init__(self, dirname):
self.dirname = dirname
def __iter__(self):
for fn in os.listdir(self.dirname):
text = open(os.path.join(self.dirname, fn)).read()
for sentence in self.SPLIT_SENTENCES.split(text):
yield gensim.utils.simple_preprocess(sentence, deacc=True)
### updates datadict, uses model, tags, writes labeled txt file out
def clean_dict_keys(datadict):
""" Removes words that make bad keys/nouns for the word swaps. """
for key in datadict.keys():
if key == 'xi':
datadict.pop(key, None)
if ',' in key:
datadict.pop(key, None)
continue
if '--' in key:
datadict.pop(key, None)
continue
if 'xx' in key:
datadict.pop(key, None)
continue
if 'xv' in key:
datadict.pop(key, None)
continue
if '\n' in key:
datadict.pop(key, None)
continue
if '.' in key:
datadict.pop(key, None)
continue
return datadict
def cap_me(word, capflag):
""" Capitalizes the word if it was in the original text """
if capflag:
return word.capitalize()
else:
return word
def build_dict_write_file(model, outputtextfile, outputdatafile, tags=['NN']):
""" Writes out the labeled text with word swaps, returns datadict """
screen_outs = ['CHAPTER', 'chapter', 'Chapter', '', ',', 'xi'] # too common and not useful
datadict = {}
print tags
with open('books/austen_pride.txt', 'r') as handle:
lines = handle.read()
lines = lines.split('\n\r')
full_text = ''
for sample in lines:
new = sample.replace('\n\r',' ')
new = new.replace('_', '')
new = new.replace(',', ', ')
new = new.replace('--', ' -- ')
line = parse(new,
tokenize = True, # Split punctuation marks from words?
tags = True, # Parse part-of-speech tags? (NN, JJ, ...)
chunks = False, # Parse chunks? (NP, VP, PNP, ...)
relations = False, # Parse chunk relations? (-SBJ, -OBJ, ...)
lemmata = False, # Parse lemmata? (ate => eat)
encoding = 'utf-8', # Input string encoding.
tagset = None)
line = line.replace('\n', ' ')
sample_spaced = line.split(' ')
line_text = '<p>'
for word in sample_spaced:
capFlag = False
newWord = word
word = word.replace('\n', ' ')
if word.strip() == '':
continue
bits = word.split('/')
if len(bits) == 2:
if bits[1] in tags and bits[0] not in screen_outs:
if bits[0][0] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": # check for first cap
capFlag = True
try:
# best match for original word
best = model.most_similar(positive=[bits[0].lower()], negative=[], topn=1) # (word, score)
if best[0][0].strip() in screen_outs:
raise 'WordError'
# add original word to datadict, with best match as word for it
datadict[bits[0].lower().encode('utf8')] = {"word": best[0][0].lower(),
"score": "%.3f" % best[0][1]}
# add the reverse word to the same dict, for graphing in front end
best_reverse = model.most_similar(positive=[best[0][0]], negative=[], topn=1)
if best_reverse[0][0] in screen_outs:
raise 'WordError'
datadict[best[0][0].lower().encode('utf8')] = {"word": best_reverse[0][0].lower(),
"score": "%.3f" % best_reverse[0][1]}
newWord = '<span class="color" title="Was: <span class=\'purple\'>'\
+ bits[0] + '</span> / Word2Vec similarity: ' + str(round(best[0][1],3))\
+ '" formerWord="' + bits[0].lower() + '" newWord="' + best[0][0].lower() + '" score="'\
+ str(round(best[0][1],3)) + '">' + cap_me(best[0][0],capFlag) + '</span>'
line_text += newWord
except Exception, e: # on model errors etc, just continue past it
print e
line_text += word.replace('/NN', '')
pass
else:
if bits[0] in [',',':','.','"',';','?', '!', '\'s', '\'ll', 'n\'t']:
line_text = line_text.strip() # remove spaces before these
line_text += bits[0]
if line_text[-1] != '"': # no space after quotes; hacky.
line_text += ' '
else:
if word.strip() != '':
line_text += word.strip()
full_text += line_text + '\n'
print "vocab before clean", datadict.keys()
datadict = clean_dict_keys(datadict)
print "vocab after clean", datadict.keys()
with open(outputtextfile, 'w') as handle:
handle.write(full_text.encode('utf8'))
jsonfile = json.dumps(datadict)
with open(outputdatafile, 'w') as handle:
handle.write(jsonfile)
print "Wrote out to data directory:", outputtextfile, outputdatafile
return datadict
# The wrapper function that runs everything, writes files, and returns useful things.
def run_all(tags=['NN'], filepref='data/pride_NN'):
# notice here i require at least 2 uses of each word in the corpus - reduces noise
# uncomment the version of the model you want to try: tag limited, or all words.
#model = gensim.models.Word2Vec(MySentences('books', tags=tags), min_count=2, size=200, workers=2)
model = gensim.models.Word2Vec(DirOfPlainTextCorpus('books'), min_count=3, workers=2)
model.save(filepref + '_model_austen_allwordsmin3')
# if you want to save time and use a saved file, comment out the above and uncomment this with right path
#model = gensim.models.Word2Vec.load('data/pride_NNPRP_model_austen_all')
# does the text tagging and word replacement
print tags
datadict = build_dict_write_file(model, filepref + '_labeled.txt', filepref + '_data.json', tags=tags)
# tsne input files part
make_score_files(model, datadict, filepref)
# the actual tsne graph bit
X = np.loadtxt(filepref + "_scores.csv")
labels = np.genfromtxt(filepref + "_words.csv", dtype=str)
Y = tsne.tsne(X, 2, 50, 20.0) # see tsne.py in repo
do_tsne_files(filepref + '_coords.tsv', Y, labels, datadict, axis_off=True)
return Y, labels, datadict, model
# run it all together and get the useful outputs - this takes a while and prints a lot of stuff.
Y, labels, datadict, model = run_all(tags=['NN'], filepref='data/pride_NNAll3')
!split -l 200 data/pride_NNAll3_labeled.txt part
%%bash
mv partaa data/parts/part1.txt
mv partab data/parts/part2.txt
mv partac data/parts/part3.txt
mv partad data/parts/part4.txt
mv partae data/parts/part5.txt
mv partaf data/parts/part6.txt
mv partag data/parts/part7.txt
mv partah data/parts/part8.txt
mv partai data/parts/part9.txt
mv partaj data/parts/part10.txt
mv partak data/parts/part11.txt
mv partal data/parts/part12.txt
ls data/parts
part1.txt part11.txt part2.txt part4.txt part6.txt part8.txt part10.txt part12.txt part3.txt part5.txt part7.txt part9.txt
model.most_similar(['husband'])
[(u'nerves', 0.8918779492378235), (u'lifting', 0.7963227033615112), (u'wishes', 0.7679949998855591), (u'nephew', 0.7674976587295532), (u'senses', 0.7639766931533813), (u'daughter', 0.7601332664489746), (u'ladyship', 0.7527087330818176), (u'daughters', 0.7525165677070618), (u'thoughts', 0.7426179647445679), (u'mother', 0.7310776710510254)]
model.most_similar(['wife'])
[(u'son', 0.7893723249435425), (u'reviving', 0.7113327980041504), (u'daughter', 0.7054953575134277), (u'admittance', 0.6823280453681946), (u'attentions', 0.658092737197876), (u'warmed', 0.6542254090309143), (u'niece', 0.6514275074005127), (u'addresses', 0.6490938663482666), (u'proposals', 0.647223174571991), (u'behaviour', 0.6413060426712036)]
# for tsne - write out the scores and labels as 2 row syncd files - can be used in R now.
def make_score_files(model, datadict, filelabel):
with open(filelabel + '_scores.csv', 'w') as scorefile:
with open(filelabel + '_words.csv', 'w') as wordfile:
for word in datadict.keys():
try:
score = model[word.lower()]
scores = [str(x) for x in score]
scorefile.write('\t'.join(scores) + '\n')
wordfile.write(word.lower() + '\n')
except:
print "Not found:", word, datadict[word]
continue
scorefile.write('\n')
wordfile.write('\n')
print 'Wrote ', scorefile, wordfile
!paste data/pride_NN_words.csv data/pride_NN_scores.csv > data_for_r.tsv
# this is a function you can call independently with output from your run-all run, to jigger with details if you want.
def do_tsne_files(coordsfilename, Y, labels, datadict, axis_off=True):
""" Makes the tsne grah file and writes out the coords for the UI. """
plt.figure(figsize=(15, 15))
if axis_off:
plt.axis('off') # only turn off after checking the axis min/max
plt.scatter(Y[:,0], Y[:,1], s=10, color='gray', alpha=0.2)
print "X and Y dims ", plt.axis()
with open(coordsfilename, 'w') as handle1:
handle1.write('word\tx\ty\tnearest\n')
for label, x, y in zip(labels, Y[:, 0], Y[:, 1]):
score = datadict[label]['score']
handle1.write('\t'.join([label, str(x), str(y), str(score)]) + '\n')
#plt.text(x, y, label, size=8, alpha=0.5)
print "wrote out", coordsfilename
do_tsne_files('misc.tsv', Y, labels, datadict, axis_off=False) # run this again with axis on, to check axes limits for the graph...
X and Y dims (-150.0, 150.0, -150.0, 150.0) wrote out misc.tsv