cmd
¶parallel
¶text
¶workflow
¶modeling
¶import os
import pandas as pd
#all you really need to know is that CABLES is the directory where the data (or cables)
#are stored on your machine
# Root data directory, read from the DATA environment variable.
# A KeyError here means DATA is not set in your shell.
DATA = os.environ['DATA']
# Layout of the cables corpus on disk: raw text files, processed
# artifacts, and sparse (Vowpal Wabbit format) representations.
CABLES = os.path.join(DATA, 'declass', 'cables_short')
RAW = os.path.join(CABLES, 'raw')
PROCESSED = os.path.join(CABLES, 'processed')
SPARSE = os.path.join(CABLES, 'sparse')
# Output paths: the VW sparse file, its filtered version, and the
# pickled SFileFilter that maps tokens to/from integer ids.
sfile_path = os.path.join(SPARSE, 'cables-short.vw')
filtered_sfile_path = os.path.join(PROCESSED, 'cables-short-filtered.vw')
sff_path = os.path.join(PROCESSED, 'sff.pkl')
#filefilter is a module which helps with basic file/dir functions, such as
#retrieving all paths from a given directory and its subdirectories
from rosetta.text import filefilter
def simple_file_streamer(base_path):
    """Yield the full text of each file found under ``base_path``.

    Walks ``base_path`` (and its subdirectories) lazily via
    ``filefilter.get_paths``, opening and reading one file at a time
    so the whole corpus never needs to fit in memory.
    """
    for path in filefilter.get_paths(base_path, get_iter=True):
        with open(path) as handle:
            yield handle.read()
def my_iter(N):
    """Yield the integers 0 through N-1, one at a time.

    A minimal generator example.  Note: the original raised
    StopIteration explicitly to end the stream; under PEP 479
    (Python 3.7+) raising StopIteration inside a generator body is
    converted to RuntimeError, so a plain ``return`` (or simply
    falling off the end) is the correct way to finish a generator
    in both Python 2 and 3.
    """
    i = 0
    while i < N:
        yield i
        i += 1
mi = my_iter(5)
mi.next()
0
#note the raised StopIteration; let's see how a for loop handles this
for i in my_iter(5):
print i
0 1 2 3 4
simple_stream = simple_file_streamer(RAW)
#lets look at what this object is
type(simple_stream)
generator
#lets see what the .next() yields (and splitlines to make it more readable)
simple_stream.next().splitlines()
--------------------------------------------------------------------------- StopIteration Traceback (most recent call last) <ipython-input-20-a9c0145312d7> in <module>() 1 #lets see what the .next() yields (and splitlines to make it more readable) ----> 2 simple_stream.next().splitlines() StopIteration:
from rosetta import TextFileStreamer, TokenizerBasic
text_streamer = TextFileStreamer(text_base_path=RAW, file_type='*',
tokenizer=TokenizerBasic())
from rosetta.text import streamers
stream = text_streamer.info_stream()
stream.next()
{'atime': 1393622776.0, 'cached_path': '/Users/danielkrasner/DATA_master/prod/declass/cables_short/raw/1976ZANZIB00097', 'doc_id': '1976ZANZIB00097', 'mtime': 1383854248.0, 'size': 118, 'text': 'MRN: 1976ZANZIB000097 SEGMENT NUMBER: 000001 EXPAND ERROR ENCOUNTERED;\nTELEGRAM TEXT FOR THIS SEGMENT IS UNAVAILABLE', 'tokens': ['mrn', 'segment', 'number', 'expand', 'error', 'encountered', 'telegram', 'text', 'segment', 'unavailable']}
#lets take a quick look at TextFileStreamer
TextFileStreamer?
text = stream.next()['text']
print text
LIMITED OFFICIAL USE PAGE 01 YAOUND 00757 010739Z 17 ACTION OES-05 INFO OCT-01 AF-06 ISO-00 EB-07 AID-05 /024 W --------------------- 068311 R 010645Z MAR 76 FM AMEMBASSY YAOUNDE TO SECSTATE WASHDC 7910 LIMITED OFFICIAL USE YAOUNDE 0757 E.O.: 11652: N/A TAGS: SPOP SUBJ: IMPLICATION OF WORLDWIDE POPULATION GROWTH FOR UNITED STATES SECURITY AND OVERSEAS INTERESTS REFS: (A) STATE 301427 (B) STATE 297241 COMMENTS BELOW ARE KEYED TO LETTERED SECTIONS PARA 4 REF (A): A. CAMEROON'S BASIC POPULATION POLICY: CAMEROON HAS NO CLEARLY STATED POPULATION POLICY. CAMEROON'S PRESENT PRACTICE REFLECTS THE INFLUENCE OF OLD FRENCH LAW, TRADITIONAL DESIRE FOR LARGE FAMILIES, AND INCREASING AWARENESS BY YOUNGER CAMEROONIANS OF NEED FOR FAMILY PLANNING AND TO A CERTAIN EXTENT, BIRTH CONTROL. THERE IS NO CLEAR LEGAL PROHIBITION OF CONTRACEPTIVE PRACTICES, YET REFERENCES TO EARLIER FRENCH LAWS LEAVE LOCAL LAWYERS IN AGREEMENT THAT CAMEROON HAS NEITHER A PRO-FAMILY PLANNING POLICY NOR A LEGISLATIVE ENVIRONMENT CONDUCIVE TO THE ESTABLISHMENT OF FAMILY PLANNING SERVICES. NEVERTHELESS, CAMEROON IS NOT PRO- NATALIST, AND LEADING GOVERNMENT OFFICIALS ARE INVOLVED IN FAMILY PLANNING TRAINING AND THE DEVELOPMENT OF PILOT FAMILY PLANNING SERVICE PROGRAMS. THE GOVERNMENT OF CAMEROON (GURC) IS NOW IN THE FINAL STATES OF DRAFTING THE FOURTH FIVE-YEAR PLAN. AT EARLIER STAGES IN THE DRAFTING, IT WAS DECIDED TO CREATE AN INTERMINISTERIAL COMMISSION TO DEAL WITH POPULATION ISSUES. THE FOURTH FIVE-YEAR PLAN, WHEN IT EMERGES IN FINAL, WILL REVEAL THE DEGREE TO WHICH CAMEROON HAS BEEN ABLE TO DEFINE A POPULATION POLICY. LIMITED OFFICIAL USE LIMITED OFFICIAL USE PAGE 02 YAOUND 00757 010739Z B. CAMEROON'S POPULATION PROGRAM: DEFINED AS SUCH, CAMEROON DOES NOT HAVE A POPULATION PROGRAM. HOWEVER, THERE ARE A NUMBER OF CURRENT PROJECTS WHICH REFLECT THE PERMISSIVE STANCE OF THE GOV- ERNMENT TOWARD FAMILY PLANNING ACTIVITIES. 
AID IS SUPPORTING A NUMBER OF THESE ACTIVITIES WHICH RELATE TO FERTILITY DECLINE: THESE INCLUDE THE TRAINING OF MIDWIVES IN FAMILY PLANNING SER- VICE DEVELOPMENT AND ADMINISTRATION; ASSISTING WITH THE FIRST NATIONAL CENSUS; SUPPORTING FAMILY PLANNING CLINICS AT URBAN HEALTH CENTERS AND THE NATIONAL HOSPITAL (CUSS); AND THE DEVELOP- MENT OF EDUCATIONAL MATERIALS AIMED AT INTEGRATING FAMILY PLAN- NING INTO FAMILY HEALTH TRAINING PROGRAMS. THESE ACTIVITIES ARE NOT COSTLY AND CONTRIBUTE SIGNIFICANTLY TO THE PROCESS WHICH WILL EVENTUALLY REQUIRE GURC TO DEVELOP A POPULATION POLICY FAVORING THE AVAILABILITY OF FAMILY PLANNING SERVICES. CONTINUED SUPPORT IS JUSTIFIED IN ORDER TO SUSTAIN THESE FORCES UNTIL A FAVORABLE GOVERNMENT POLICY OPENS THE DOOR FOR MUCH STRONGER SUPPORT. C. POPULATION GROWTH IN CAMEROON IS AT AN ESTIMATED RATE OF 2.2 PERCENT WHICH IN URBAN AREAS IS AS HIGH AS 8 PERCENT BECAUSE OF MIGRATION. THE INFLUENCE OF THIS GROWTH UPON NATIONAL DEVELOP- MENT HAS TO DATE BEEN ONLY MARGINAL AT THE NATIONAL LEVEL. EX- PENDITURES FOR SOCIAL SERVICES ARE STEADILY INCREASING, BUT THUS FAR THERE HAVE BEEN NO SIGNIFICANT EFFORTS ON FOOD IMPORTS, DOMESTIC SAVINGS AND THE BALANCE OF PAYMENTS. D. SOCIO-ECONOMIC DEVELOPMENT, ON THE OTHER HAND, HAS FELT THE PRESSURE OF POPULATION GROWTH, PARTICULARLY IN THE RAPIDLY EX- PANDING CITIES. UNEMPLOYMENT LEVELS ARE HIGH, THE PRICE OF FOOD IN THE CITIES IS SOARING, SCHOOLS ARE UNABLE TO ABSORB THE YOUNG, AND MINOR THEFT IS COMMONPLACE. TRADITIONAL FAMILY TIES ARE NOT MAINTAINED BECAUSE OF THE INABILITY TO AFFORD THE EXCHANGE OF GIFTS. E. THE URBAN ENVIRONMENT ALSO REFLECTS THE OVERCROWDED CONDI- TIONS AS WASTE REMOVAL CANNOT KEEP PACE WITH THE NEED. 
IN NORTH CAMEROON, WHICH HAS SOME OF THE MOST DENSELY POPULATED AREAS OF THE COUNTRY, THE COMBINATION OF OVERGRAZING AND THE DROUGHT HAS LED TO A DEGENERATION OF LAND AND WATER RESOURCES TO A POINT WHERE IT CAN BE SEEN THAT, IF THIS PROCESS IS NOT REVERSED, NORTH CAMEROON WILL NOT BE ABLE TO SUPPORT ITS PEOPLE AFTER TEN TO LIMITED OFFICIAL USE LIMITED OFFICIAL USE PAGE 03 YAOUND 00757 010739Z FIFTEEN YEARS. F. THE CURRENT POLITICAL CLIMATE IN CAMEROON IS STABLE AND NOT LIKELY TO BE THREATENED BY POPULATION PRESSURE IN THE FORESEEABLE FUTURE. UNEMPLOYMENT AND INFLATION HAVE RESULTED IN INCREASED WORKER DEMANDS AND PETTY CRIME BUT THIS IS NOT LIKELY TO INVOLVE OTHER CENTRAL AFRICAN COUNTRIES. PRESENT GURC THINKING ON POPU- LATION PROBLEMS EMPHASIZES RESETTLEMENT IN NEW DEVELOPMENT AREAS RATHER THAN LIMITATION OF GROWTH TO RELIEVE PRESSURE. G. THE UNITED STATES CAN BEST CONTRIBUTE TO THE EVOLUATION OF A FAVORABLE POPULATION POLICY IN CAMEROON BY CONTINUING TO OFFER A VARIETY OF TYPES OF ASSISTANCE THROUGH A COMBINATION OF GOVERN- MENTAL AND PRIVATE ORGANIZATIONS. THIS APPROACH PERMITS THE APPLI- CATION OF RESOURCES TO SMALL EFFORTS WHERE THE GREATEST POLITI- CAL IMPACT CAN BE EXPECTED, WHILE RESERVING LARGE-SCALE SUPPORT UNTIL SUCH TIME AS THERE IS AN OFFICIAL EFFORT TO DEVELOP NAT- IONAL FAMILY PLANNING SERVICES. SPIRO LIMITED OFFICIAL USE NNN
text_streamer.tokenizer.text_to_token_list(text)
#lets look at a few methods
token_stream = text_streamer.token_stream() # returns a generator function which yields a stream of tokens
token_stream.next() # this is what our basic tokenizer returns (we are skipping stop words and numerics by default)
['unclassified', 'page', 'zagreb', 'action', 'eur', 'info', 'oct', 'iso', 'eure', 'eb', 'faa', 'dote', 'pm', 'nsc', 'sp', 'ss', 'ciae', 'dode', 'inr', 'nsae', 'pa', 'usia', 'prs', 'mct', 'sep', 'fm', 'amconsul', 'zagreb', 'secstate', 'washdc', 'priority', 'info', 'amembassy', 'belgrade', 'unclas', 'section', 'zagreb', 'eo', 'na', 'tags', 'pfor', 'eair', 'yo', 'subj', 'twa', 'hijacking', 'zagreb', 'press', 'reaction', 'ref', 'belgrade', 'both', 'zagreb', 'sunday', 'papers', 'vjesnik', 'borba', 'featured', 'series', 'articles', 'twa', 'hijacking', 'borba', 'carried', 'tanjujg', 'commentary', 'same', 'editorial', 'appeared', 'belgrade', 'edition', 'reftel', 'vjesnik', 'carried', 'articles', 'paris', 'new', 'york', 'correspondents', 'article', 'datelined', 'zagreb', 'summarizes', 'tanjug', 'report', 'vjesnik', 'summary', 'tanjug', 'report', 'strongly', 'criticizes', 'tolerating', 'activities', 'anti', 'yugoslav', 'extremist', 'groups', 'article', 'new', 'york', 'correspondent', 'drazen', 'vukov', 'colic', 'contains', 'different', 'slants', 'hijacking', 'example', 'article', 'interprets', 'hijacking', 'act', 'desperation', 'croatian', 'emigre', 'groups', 'finding', 'less', 'less', 'support', 'anti', 'yugoslav', 'activities', 'united', 'states', 'coming', 'under', 'quote', 'more', 'determined', 'unquote', 'investigation', 'police', 'complete', 'text', 'latter', 'article', 'follows', 'quote', 'unclassified', 'unclassified', 'page', 'zagreb', 'those', 'insane', 'people', 'mayor', 'new', 'york', 'city', 'abraham', 'beame', 'those', 'lunatics', 'threaten', 'many', 'lives', 'action', 'know', 'achieve', 'anything', 'hope', 'murderers', 'brought', 'justice', 'one', 'first', 'reactions', 'news', 'self', 'appointed', 'fighters', 'freedom', 'croatia', 'hijacked', 'airplane', 'american', 'company', 'twa', 'friday', 'evening', 'central', 'european', 'time', 'saturday', 'morning', 'local', 'time', 'new', 'york', 'airport', 'laguardia', 'starting', 'drop', 'leaflets', 'european', 
'capitals', 'during', 'world', 'war', 'ii', 'explains', 'new', 'york', 'post', 'readers', 'ustashi', 'fight', "tito's", 'partisans', 'communists', 'killed', 'eight', 'hundred', 'thousand', 'jews', 'serbs', 'gypsies', 'together', 'croats', 'opponents', 'regime', 'newspaper', 'puppet', 'italian', 'german', 'hands.', 'thus', 'address', 'terrorists', 'put', 'end', 'long', 'proclamation', 'number', 'post', 'office', 'box', 'sharp', 'condemnations', 'descriptions', 'political', 'background', 'hijackers', 'immediately', 'started', 'arrive', 'hoping', 'praise', 'really', 'difficult', 'night', 'terror', 'new', 'york', 'post', 'saturday', 'morning', 'american', 'newspapers', 'radio', 'television', 'stations', 'agreed', 'question', 'propaganda', 'action', 'extremists', 'one', 'group', 'passengers', 'released', 'hijacked', 'plane', 'canada', 'spoke', 'propaganda', 'subjected', 'time.', 'bloody', 'introduction', 'leaflets', 'very', 'seriously', 'pompously', 'speak', 'unbearable', 'cultural', 'political', 'economic', 'exploitation', 'croatia', 'bloody', 'introduction', 'prior', 'hijacking', 'bomb', 'placed', 'one', 'central', 'stations', 'underground', 'railway', 'new', 'york', 'city', 'members', 'new', 'york', 'city', 'police', 'bomb', 'disposal', 'squad', 'unfortunately', 'succeed', 'mastering', 'monstrous', 'device', 'friday', 'evening', 'exploded', 'hands', 'four', 'policemen', 'brian', "o'murray", 'killed', 'spot', 'sergeant', 'terrence', 'mctigue', 'fighting', 'life', 'henry', 'dworkin', 'fritz', "o'behr", 'hospital', 'seriously', 'wounded', 'mayor', 'beame', 'duty', 'unclassified', 'unclassified', 'page', 'zagreb', 'whole', 'night', 'hospital', 'during', 'operation', 'wounded', 'policemen', 'chicago', 'airport', 'families', 'crying', 'american', 'citizens', 'cross', 'ocean', 'airplane', 'technically', 'equipped', 'trip', 'name', 'appeal', 'american', 'people', 'fighters', 'free', 'croatia.', 'unclassified', 'nnn', 'unclassified', 'page', 'zagreb', 'action', 'eur', 'info', 
'oct', 'iso', 'eure', 'eb', 'faa', 'dote', 'pm', 'nsc', 'sp', 'ss', 'ciae', 'dode', 'inr', 'nsae', 'pa', 'usia', 'prs', 'mct', 'sep', 'fm', 'amconsul', 'zagreb', 'secstate', 'washdc', 'priority', 'info', 'amembassy', 'belgrade', 'unclas', 'section', 'zagreb', 'authors', 'proclamation', 'struggling', 'against', 'shadows', 'hitler', 'mussolini', 'renunciation', 'past', 'admit', 'even', 'justified', 'use', 'force', 'cause', 'fear', 'discontent', 'one', 'part', 'population', 'promise', 'use', 'force', 'little', 'possible', 'nevertheless', 'hijacked', 'airplane', 'planted', 'bomb', 'killed', 'one', 'person', 'announced', 'second', 'bomb', 'explode', 'time', 'indignation', 'such', 'liberation', 'policy', 'support', 'american', 'citizens', 'american', 'government', 'death', 'injuries', 'members', 'bomb', 'disposal', 'squad', 'aroused', 'anger', 'citizens', 'new', 'york', 'still', 'forgotten', 'unexplained', 'explosion', 'laguardia', 'airport', 'now', 'again', 'listen', 'very', 'same', 'airport', 'another', 'hijacking', 'passenger', 'plane', 'happened', 'full', 'months', 'peace', 'entire', 'usa', 'usa', 'even', 'worst', 'gangsters', 'know', 'murder', 'policeman', 'unpardonable', 'crime', 'hijacking', 'airplane', 'act', 'detested', 'united', 'states', 'terrorists', 'decide', 'commit', 'two', 'unpopular', 'types', 'crime', 'reply', 'given', 'blackmailing', 'memorandum', 'simply', 'based', 'fact', 'unclassified', 'unclassified', 'page', 'zagreb', 'american', 'public', 'less', 'less', 'support', 'anti', 'yugoslav', 'disruptive', 'ideas', 'awaken', 'conscience', 'recently', 'state', 'department', 'declared', 'future', 'continue', 'support', 'respect', 'integrity', 'unity', 'yugoslavia', 'fighters', 'freedom', 'croatia', 'denouncing', 'besides', 'fighting', 'against', 'economic', 'coopera', 'tion', 'between', 'yugoslavia', 'usa', 'disruptive', 'efforts', 'less', 'favorably', 'judged', 'american', 'police', 'more', 'determined', 'investigating', 'real', 'content', 'work', 
'emigre', 'associations', 'congressmen', 'opposing', 'future', 'protection', 'former', 'war', 'criminals', 'now', 'live', 'undisturbed', 'usa', 'therefore', 'panic', 'began', 'reign', 'emigre', 'ranks', 'therefore', 'one', 'invent', 'something', 'really', 'spectacular', 'hijacking', 'leaflets', 'bombs', 'barter', 'large', 'american', 'newspapers', 'published', 'extensive', 'statement', 'terrorists', 'basic', 'condition', 'disarming', 'second', 'bomb', 'releasing', 'kidnapped', 'passengers', 'although', 'majority', 'newspapers', 'agree', 'completely', 'hijackers', 'dictate', 'important', 'pages', 'slightest', 'detail', 'nevertheless', 'more', 'certain', 'press', 'fulfilled', 'part', 'agreement', 'new', 'american', 'tactic', 'aims', 'negotiations', 'long', 'possible', 'sorts', 'terrorists', 'order', 'rescue', 'many', 'human', 'lives', 'possible', 'newspapers', 'consented', 'barter', 'terrorists', 'fulfill', 'promises', 'now', 'clear', 'one', 'believed', 'such', 'people', 'sake', 'truth', 'one', 'admit', 'section', 'american', 'newspapers', 'radio', 'television', 'approached', 'whole', 'story', 'purely', 'reasons', 'circulation', 'more', 'less', 'seriousness', 'trying', 'find', 'everything', 'even', 'deeper', 'political', 'reasons.', 'minority', 'whether', 'out', 'negligence', 'ignorance', 'commercialism', 'politics', 'last', 'resort', 'hijackers', 'those', 'sympathizers', 'croats', 'many', 'sometimes', 'very', 'loud', 'especially', 'stormy', 'time', 'america', 'various', 'unclassified', 'unclassified', 'page', 'zagreb', 'groups', 'try', 'grab', 'large', 'possible', 'piece', 'power', 'election', 'decision', 'making', 'such', 'battles', 'maneuvers', 'sometimes', 'use', 'anti', 'yugoslav', 'purposes', 'religion', 'nation', 'such', 'stories', 'yugoslav', 'persecution', 'jews', 'sometimes', 'broader', 'internal', 'squaring', 'accounts', 'effort', 'show', 'washington', 'amoral', 'cooperates', 'dictatorial', 'regimes', 'people', 'try', 'include', 'yugoslavia', 'sometimes', 
'general', 'world', 'trends', 'usa', 'feels', 'itself', 'threatened', 'strengthening', 'non', 'aligned', 'policy', 'emigre', 'groups', 'try', 'use', 'ends', 'such', 'internal', 'political', 'international', 'factors', 'temporarily', 'slow', 'down', 'development', 'american', 'yugoslav', 'relations', 'thus', 'over', 'capitals', 'europe', 'quite', 'unusual', 'leaflets', 'fluttered', 'already', 'many', 'things', 'irretrievably', 'failed', 'unquote', 'kaiser', 'unclassified', 'nnn']
text_streamer.doc_id # returns a list of retrieved doc ids etc
#if you want to use another tokenizer it's easy
import nltk
nltk.word_tokenize(text)
text_streamer_nltk = TextFileStreamer(text_base_path=RAW, file_type='*',
tokenizer_func=nltk.word_tokenize)
stream_nltk = text_streamer_nltk.token_stream()
stream_nltk.next()
['LIMITED', 'OFFICIAL', 'USE', 'PAGE', '01', 'ZURICH', '00534', '051851Z', '53', 'ACTION', 'SY-05', 'INFO', 'OCT-01', 'EUR-12', 'ISO-00', 'SS-15', '/033', 'W', '--', '--', '--', '--', '--', '--', '--', '--', '--', '--', '-', '127954', 'O', 'R', '051810Z', 'SEP', '76', 'ZFF4', 'FM', 'AMCONSUL', 'ZURICH', 'TO', 'USMISSION', 'GENEVA', 'NIACT', 'IMMEDIATE', 'INFO', 'AMCONSUL', 'JOHANNESBURG', 'SECSTATE', 'WASHDC', '2360', 'LIMITED', 'OFFICIAL', 'USE', 'ZURICH', '534', 'DEPT', 'FOR', 'SY/CIC', 'E.', 'O.', '11652', ':', 'N/A', 'TAGS', ':', 'ASEC', 'OVIP', '(', 'KISSINGER', ',', 'MRS', 'HENRY', 'A', ')', 'SUBJ', ':', 'TRAVEL', 'OF', 'SY', 'ADVANCE', 'TEAM', '1.', 'SPECIAL', 'AGENTS', 'WOODS', 'AND', 'BALL', 'TO', 'DEPART', 'ZURICH', '9/6/76', 'ON', 'SWISSAIR', '#', '942', 'AT', '1935.', 'DUE', 'TO', 'CHANGE', 'IN', 'ITINERARY', ',', 'AGENTS', 'WILL', 'NOT', 'REPEAT', 'WILL', 'NOT', 'CONTINUE', 'ON', 'TO', 'JOHANNESBURG', 'ON', 'BRITISH', 'AIRWAYS', '#', '031.', '2.', 'AGENTS', 'WILL', 'REMAIN', 'IN', 'GENEVA', 'UNTIL', 'FURTHER', 'INSTRUCTIONS', 'FROM', 'HEADQUARTERS.', '3.', 'REQUEST', 'HOTEL', 'ACCOMMODATIONS', '(', 'LONGCHAMP', 'OR', 'EPSOM', ')', 'BE', 'MADE', 'AND', 'ASSISTANCE', 'UPON', 'ARRIVAL.', 'NELSON', 'LIMITED', 'OFFICIAL', 'USE', 'NNN']
from rosetta.text import text_processors, filefilter, streamers, vw_helpers
#create the VW format file
my_tokenizer = text_processors.TokenizerBasic()
stream = streamers.TextFileStreamer(text_base_path=RAW, tokenizer=my_tokenizer)
stream.to_vw(sfile_path, n_jobs=-1, raise_on_bad_id=False)
### somewhere here run (stick with 5 passes or so...)
# rm -f *cache
#vw --lda 20 --cache_file doc_tokens.cache --passes 5 -p prediction.dat --readable_model topics.dat --bit_precision 16 --lda_D 975 --lda_rho 0.1 --lda_alpha 1 ../sparse/cables-short.vw
#load the sparse file
formatter = text_processors.VWFormatter()
sff = text_processors.SFileFilter(formatter)
sff.load_sfile(sfile_path)
#remove "gaps" in the sequence of numbers (ids)
sff.compactify()
sff.save(PROCESSED + '/sff_basic.pkl')
Compactification done. self.bit_precision_required = 14 collisions = 0, vocab_size = 14476 All collisions resolved
sff.to_frame().sort_index(by='doc_fraction', ascending=False).head(10)
doc_freq | token_score | doc_fraction | |
---|---|---|---|
token | |||
fm | 829 | 845 | 0.849385 |
info | 829 | 1432 | 0.849385 |
page | 829 | 1325 | 0.849385 |
oct | 829 | 973 | 0.849385 |
action | 829 | 907 | 0.849385 |
iso | 828 | 852 | 0.848361 |
secstate | 827 | 854 | 0.847336 |
washdc | 824 | 907 | 0.844262 |
nnn | 819 | 830 | 0.839139 |
tags | 782 | 785 | 0.801230 |
10 rows × 3 columns
#use the LDAResults class from rosetta to convert back to readable, python friendly formats
lda = vw_helpers.LDAResults(PROCESSED + '/topics.dat',
PROCESSED + '/prediction.dat', PROCESSED + '/sff_basic.pkl')
#look at some of the words
topic_words = lda.pr_token_g_topic.loc[:,'topic_12'].order(ascending=False).index[:10]
lda.sfile_frame.loc[topic_words]
doc_freq | token_score | doc_fraction | |
---|---|---|---|
token | |||
museums | 1 | 2 | 0.001025 |
anniversary | 7 | 10 | 0.007172 |
inherent | 1 | 1 | 0.001025 |
joaquin | 1 | 1 | 0.001025 |
textile | 1 | 7 | 0.001025 |
marcel | 2 | 3 | 0.002049 |
involved. | 1 | 1 | 0.001025 |
disclosure | 2 | 3 | 0.002049 |
deportation | 2 | 3 | 0.002049 |
conakry | 3 | 14 | 0.003074 |
10 rows × 3 columns
#look at the topics themselves
lda.print_topics(10)
========== Printing top 10 tokens in every topic========== ------------------------------ Topic name: topic_00. P[topic_00] = 0.0271 topic_00 doc_freq token affords 0.020413 1 mattered 0.019496 1 principal 0.001991 20 manager 0.001038 16 shunting 0.000777 2 mac 0.000718 1 cost 0.000718 32 devaluations 0.000718 1 rooms 0.000718 1 interfiew 0.000718 2 ------------------------------ Topic name: topic_01. P[topic_01] = 0.0278 topic_01 doc_freq token thirdly 0.063519 1 rigs 0.001077 3 city 0.000728 16 phoned 0.000707 2 feeling 0.000462 8 dissuaded 0.000457 1 minorities 0.000386 4 nyc 0.000373 1 newsroundup 0.000373 1 resrep 0.000373 1 ------------------------------ Topic name: topic_02. P[topic_02] = 0.0392 topic_02 doc_freq token companying 0.074200 1 respected 0.032733 3 obtains 0.027652 1 less 0.023041 21 ailment 0.017348 1 palmer 0.016520 2 detecting 0.015943 2 nb 0.015255 1 croatian 0.013441 48 transmitting 0.013423 2 ------------------------------ Topic name: topic_03. P[topic_03] = 0.0261 topic_03 doc_freq token epstein 0.018949 1 rigs 0.000455 3 manager 0.000421 16 points 0.000277 23 slavia 0.000261 2 retired 0.000210 2 minorities 0.000200 4 views 0.000196 24 life. 0.000191 1 rene 0.000185 4 ------------------------------ Topic name: topic_04. P[topic_04] = 0.0278 topic_04 doc_freq token rhodes 0.023252 1 delivery 0.018596 10 embarrassed 0.010083 3 lazic 0.010066 2 procesd 0.006491 1 rigs 0.000395 3 heinous 0.000355 1 questions 0.000355 33 amounts 0.000355 4 fingerhut 0.000355 2 ------------------------------ Topic name: topic_05. P[topic_05] = 0.0264 topic_05 doc_freq token gratuitous 0.028985 1 rigs 0.000447 3 manager 0.000414 16 ound 0.000414 1 points 0.000272 23 yourself 0.000271 2 slavia 0.000261 2 unproductive 0.000206 2 sides 0.000206 7 unfamiliar 0.000204 1 ------------------------------ Topic name: topic_06. P[topic_06] = 0.0255 topic_06 doc_freq token rigs 0.000470 3 manager 0.000441 16 points 0.000287 23 slavia 0.000271 2 minorities 0.000206 4 life. 
0.000197 1 rene 0.000190 4 disapprovingly 0.000190 1 associati 0.000190 1 claiming 0.000180 2 ------------------------------ Topic name: topic_07. P[topic_07] = 0.1819 topic_07 doc_freq token unclas 0.080968 480 exercise 0.061456 8 sites 0.055679 5 urban 0.037215 3 tbio 0.025041 1 leaned 0.020269 1 lady 0.017772 2 custom 0.016974 2 condemnations 0.014577 1 professed 0.014133 2 ------------------------------ Topic name: topic_08. P[topic_08] = 0.0393 topic_08 doc_freq token ps 0.207765 2 specifies 0.028863 1 museums 0.025596 1 thereby 0.016587 2 hurt 0.013174 2 signs 0.010361 5 sherman 0.005601 2 businessmen 0.003005 5 morning.nelson 0.002849 1 owner 0.002829 3 ------------------------------ Topic name: topic_09. P[topic_09] = 0.0256 topic_09 doc_freq token slavia 0.000695 2 rigs 0.000466 3 manager 0.000433 16 points 0.000286 23 cellucam 0.000239 1 nchama 0.000230 4 texts 0.000230 3 budimir 0.000206 1 fascist 0.000206 2 minorities 0.000204 4 ------------------------------ Topic name: topic_10. P[topic_10] = 0.0425 topic_10 doc_freq token stalins 0.021645 1 dsp 0.021625 1 pell 0.017589 1 ndweta 0.016886 1 attempts 0.016374 11 gon 0.015946 2 bern 0.015598 33 moltke 0.011906 2 lisinski 0.011713 1 humanity 0.010704 1 ------------------------------ Topic name: topic_11. P[topic_11] = 0.0301 topic_11 doc_freq token repression 0.059625 2 usun 0.023723 42 wolfango 0.022077 1 recipient 0.011955 1 english 0.008423 36 heritage 0.006332 2 ound 0.003699 1 refers 0.002346 3 rigs 0.000363 3 minorities 0.000356 4 ------------------------------ Topic name: topic_12. P[topic_12] = 0.1436 topic_12 doc_freq token museums 0.028422 1 anniversary 0.026150 7 inherent 0.025394 1 joaquin 0.016931 1 textile 0.015816 1 marcel 0.015317 2 involved. 0.014861 1 disclosure 0.014578 2 deportation 0.013256 2 conakry 0.013154 3 ------------------------------ Topic name: topic_13. P[topic_13] = 0.0260 topic_13 doc_freq token croatia. 
0.010828 1 cancel 0.004762 4 rigs 0.000463 3 manager 0.000435 16 disapprovingly 0.000405 1 associati 0.000405 1 points 0.000284 23 slavia 0.000267 2 minorities 0.000203 4 life. 0.000195 1 ------------------------------ Topic name: topic_14. P[topic_14] = 0.0261 topic_14 doc_freq token racks 0.018467 1 rigs 0.000451 3 manager 0.000420 16 bradking 0.000408 1 solved 0.000306 1 academics 0.000299 2 bosio 0.000299 1 specifically 0.000299 15 points 0.000273 23 yourself 0.000272 2 ------------------------------ Topic name: topic_15. P[topic_15] = 0.0452 topic_15 doc_freq token museums 0.078717 1 kidded 0.032361 1 legality 0.019756 1 viking 0.014038 3 vest 0.012276 2 ruling 0.011657 3 vukoviceva 0.011027 2 esperer 0.008133 1 independistas 0.007940 1 traces 0.007673 1 ------------------------------ Topic name: topic_16. P[topic_16] = 0.0264 topic_16 doc_freq token plained 0.022303 1 rigs 0.000427 3 manager 0.000384 16 priority 0.000344 154 shouting 0.000344 3 city 0.000288 16 o'behr 0.000277 1 points 0.000260 23 ound 0.000253 1 slavia 0.000246 2 ------------------------------ Topic name: topic_17. P[topic_17] = 0.0330 topic_17 doc_freq token interest 0.050184 74 troops' 0.043989 1 oil 0.036823 17 tye 0.026336 1 sentiments 0.009119 1 originating 0.006924 1 parson' 0.005642 1 desertification 0.004743 1 furnace 0.001297 2 manager 0.000936 16 ------------------------------ Topic name: topic_18. P[topic_18] = 0.0372 topic_18 doc_freq token rick 0.132870 1 attendance 0.092382 16 outs 0.087129 1 rigs 0.000250 3 manager 0.000195 16 views 0.000166 24 overtaxed 0.000155 1 fidelity 0.000155 2 ound 0.000154 1 points 0.000152 23 ------------------------------ Topic name: topic_19. P[topic_19] = 0.1430 topic_19 doc_freq token negotia 0.031884 1 evenings 0.027105 1 staying 0.025209 7 proceedings 0.019902 7 cudna 0.016990 1 championship 0.016631 1 town 0.016536 17 leaned 0.014484 1 opportunities 0.012323 10 isreal 0.011789 1
##
lda.pr_topic_g_doc.T.loc[[0]].plot(kind='bar', figsize=(20,10),
title = 'First Document Topic Weights')
<matplotlib.axes.AxesSubplot at 0x10c7a6590>
#or at the average topic probabilties
import random
r = lambda: random.randint(0,255)
my_colors = ['#%02X%02X%02X' % (r(),r(),r()) for i in range(20)]
#my_colors = 'rgbkymc'
lda.pr_topic_g_doc.mean(axis=1).plot(kind='bar', figsize=(15,10), color=my_colors,
title='Average Topic Probabilities')
<matplotlib.axes.AxesSubplot at 0x10cbc85d0>