import os
import pandas as pd
#all you really need to know is that CABLES is the directory where the data (or cables)
#are stored on your machine
DATA = os.environ['DATA']
CABLES = os.path.join(DATA, 'declass', 'cables_short')
RAW = os.path.join(CABLES, 'raw')
PROCESSED = os.path.join(CABLES, 'processed')
SPARSE = os.path.join(CABLES, 'sparse')
sfile_path = os.path.join(SPARSE, 'cables-short.vw')
filtered_sfile_path = os.path.join(PROCESSED, 'cables-short-filtered.vw')
sff_path = os.path.join(PROCESSED, 'sff.pkl')
#filefilter is a module which helps with basic file/dir functions, such as
#retrieving all paths from a given directory and its subdirectories
from rosetta.text import filefilter
def simple_file_streamer(base_path):
    paths = filefilter.get_paths(base_path, get_iter=True)
    for p in paths:
        with open(p) as f:
            text = f.read()
            yield text
def my_iter(N):
    i = 0
    while True:
        if i == N:
            raise StopIteration
        else:
            yield i
            i += 1
mi = my_iter(5)
mi.next()
#note the raised StopIteration; let's see how a for loop handles this
for i in my_iter(5):
    print i
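#a side note: raising StopIteration by hand works, but the usual way to end a
#generator is to simply let it return; an equivalent sketch (my_iter_v2 is just
#an illustrative name), which also uses the built-in next() instead of .next():
def my_iter_v2(N):
    i = 0
    while i < N:
        yield i
        i += 1
next(my_iter_v2(5))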
simple_stream = simple_file_streamer(RAW)
#let's look at what this object is
type(simple_stream)
#let's see what the .next() yields (and splitlines to make it more readable)
simple_stream.next().splitlines()
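#it is often useful to keep the file path alongside the raw text; a minimal
#sketch of such a streamer (illustrative only, and the 'doc_path' key is just a
#name chosen here); the TextFileStreamer introduced next yields this kind of
#info dict, with much more, via its info_stream() method
def simple_info_streamer(base_path):
    for p in filefilter.get_paths(base_path, get_iter=True):
        with open(p) as f:
            yield {'doc_path': p, 'text': f.read()}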
from rosetta import TextFileStreamer, TokenizerBasic
text_streamer = TextFileStreamer(text_base_path=RAW, file_type='*',
                                 tokenizer=TokenizerBasic())
from rosetta.text import streamers
stream = text_streamer.info_stream()
stream.next()
#let's take a quick look at TextFileStreamer
TextFileStreamer?
text = stream.next()['text']
print text
text_streamer.tokenizer.text_to_token_list(text)
#let's look at a few methods
token_stream = text_streamer.token_stream() # returns a generator which yields the token list for each document
token_stream.next() # this is what our basic tokenizer returns (we are skipping stop words and numerics by default)
text_streamer.doc_id # a list of the retrieved doc ids
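#as a quick sanity check of the defaults, try the tokenizer on a toy string; stop
#words and pure numbers should be dropped (illustrative call, exact output depends
#on the tokenizer settings)
text_streamer.tokenizer.text_to_token_list('The 3 cables were sent in 1975')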
#if you want to use another tokenizer it's easy
import nltk
nltk.word_tokenize(text)
text_streamer_nltk = TextFileStreamer(text_base_path=RAW, file_type='*',
                                      tokenizer_func=nltk.word_tokenize)
stream_nltk = text_streamer_nltk.token_stream()
stream_nltk.next()
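#tokenizer_func just needs to be a callable mapping a text string to a list of
#tokens, so a hand-rolled tokenizer works too; a minimal regex-based sketch
#(simple_regex_tokenizer is an illustrative name, not part of rosetta):
import re
def simple_regex_tokenizer(text):
    #lower-case and keep runs of alphabetic characters only
    return re.findall(r'[a-z]+', text.lower())
text_streamer_regex = TextFileStreamer(text_base_path=RAW, file_type='*',
                                       tokenizer_func=simple_regex_tokenizer)
text_streamer_regex.token_stream().next()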
from rosetta.text import text_processors, filefilter, streamers, vw_helpers
#create the VW format file
my_tokenizer = text_processors.TokenizerBasic()
stream = streamers.TextFileStreamer(text_base_path=RAW, tokenizer=my_tokenizer)
stream.to_vw(sfile_path, n_jobs=-1, raise_on_bad_id=False)
### at this point run the following from the command line (stick with 5 passes or so...)
# rm -f *cache
#vw --lda 20 --cache_file doc_tokens.cache --passes 5 -p prediction.dat --readable_model topics.dat --bit_precision 16 --lda_D 975 --lda_rho 0.1 --lda_alpha 1 ../sparse/cables-short.vw
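#if you would rather launch vw from Python than from the shell, something along
#these lines should work (a sketch: it assumes the vw binary is on your PATH and
#simply mirrors the flags of the command above, writing the outputs into PROCESSED)
import subprocess
vw_cmd = ('vw --lda 20 --cache_file doc_tokens.cache --passes 5 '
          '-p prediction.dat --readable_model topics.dat --bit_precision 16 '
          '--lda_D 975 --lda_rho 0.1 --lda_alpha 1 ' + sfile_path)
subprocess.call(vw_cmd, shell=True, cwd=PROCESSED)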
#load the sparse file
formatter = text_processors.VWFormatter()
sff = text_processors.SFileFilter(formatter)
sff.load_sfile(sfile_path)
#remove "gaps" in the sequence of numbers (ids)
sff.compactify()
sff.save(PROCESSED + '/sff_basic.pkl')
sff.to_frame().sort_index(by='doc_fraction', ascending=False).head(10)
#use the LDAResults class from rosetta to convert back to readable, python friendly formats
lda = vw_helpers.LDAResults(PROCESSED + '/topics.dat',
                            PROCESSED + '/prediction.dat', PROCESSED + '/sff_basic.pkl')
#look at some of the words
topic_words = lda.pr_token_g_topic.loc[:,'topic_12'].order(ascending=False).index[:10]
lda.sfile_frame.loc[topic_words]
#look at the topics themselves
lda.print_topics(10)
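#note: the bar plots below use pandas' matplotlib integration; in a notebook you
#may need %matplotlib inline for the figures to display (or matplotlib.pyplot.show()
#when running as a script)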
#look at the topic weights for the first document
lda.pr_topic_g_doc.T.loc[[0]].plot(kind='bar', figsize=(20,10),
                                   title='First Document Topic Weights')
#or look at the average topic probabilities
import random
r = lambda: random.randint(0,255)
my_colors = ['#%02X%02X%02X' % (r(),r(),r()) for i in range(20)]
#my_colors = 'rgbkymc'
lda.pr_topic_g_doc.mean(axis=1).plot(kind='bar', figsize=(15,10), color=my_colors,
                                     title='Average Topic Probabilities')