%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
plt.rcParams['figure.figsize'] = (15, 5)
import os
import sys
import traceback
Reading at scale:
Martha Ballard's Diary http://dohistory.org/diary/index.html
http://www.cameronblevins.org/posts/topic-modeling-martha-ballards-diary/
Richmond Dispatch
from IPython.display import Image
Image("http://journalofdigitalhumanities.org/wp-content/uploads/2013/02/blei_lda_illustration.png")
Assuming that you are storing your data in a directory in the same place as your iPython notebook.
First, we'll need to read in plain text files.
def readtextfiles(our_directory):
"""reads in plain text files and puts them in order in a list"""
current_dir=os.getcwd() #should build up file names using os.path()
os.chdir(our_directory)
files=[file for file in os.listdir(".") if not file.startswith('.')] #defeat hidden files
files=[file for file in files if not os.path.isdir(file)==True] #defeat directories
articles=[]
for file in files:
with open(file) as plaintext: #not doing anything about encoding, could be problem
lines=plaintext.readlines()
article=" ".join(lines) #alter lines if want to skip lines
articles.append(article) #you might want to extract the file name to use; how do it?
os.chdir(current_dir)
return articles, files
our_texts, names=readtextfiles("text_examples/british-fiction-corpus")
names
['Dickens_David.txt', 'Austen_Sense.txt', 'Dickens_Bleak.txt', 'CBronte_Jane.txt', 'CBronte_Villette.txt', 'ABronte_Agnes.txt', 'Austen_Emma.txt', 'Austen_Pride.txt', 'ABronte_Tenant.txt', 'CBronte_Professor.txt', 'Dickens_Hard.txt']
def data_cleanse(docs_to_clean):
import re
D=len(docs_to_clean)
for d in range(0, D):
docs_to_clean[d] = docs_to_clean[d].lower()
docs_to_clean[d] = re.sub(r'-', ' ', docs_to_clean[d])
docs_to_clean[d] = re.sub(r'[^a-zA-Z0-9 ]', '', docs_to_clean[d])
docs_to_clean[d] = re.sub(r' +', ' ', docs_to_clean[d])
docs_to_clean[d] = re.sub(r'\s\w\s', ' ', docs_to_clean[d]) #eliminate single letters
return docs_to_clean
#You'll need to modify this as necessary!
our_texts=data_cleanse(our_texts)
#more necessary when have messy text
#eliminate escaped characters
vectorizer
from scikit learn
¶from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer(min_df=0.5, stop_words='english', use_idf=True)
document_term_matrix=vectorizer.fit_transform(our_texts)
# now let's get our vocabulary--the names corresponding to the rows
vocab=vectorizer.get_feature_names()
len(vocab)
7102
document_term_matrix.shape
(11, 7102)
document_term_matrix_dense=document_term_matrix.toarray()
dtmdf=pd.DataFrame(document_term_matrix_dense, columns=vocab)
dtmdf
18 | abandoned | abashed | abhorred | abhorrence | abide | abilities | ability | able | abode | ... | youd | youll | young | younger | youngest | youre | youth | youthful | youve | zeal | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000267 | 0.002905 | 0.001869 | 0.000000 | 0.000488 | 0.000223 | 0.002235 | 0.001219 | 0.007115 | 0.001320 | ... | 0.004916 | 0.015798 | 0.049804 | 0.003124 | 0.001788 | 0.012290 | 0.005380 | 0.004309 | 0.004146 | 0.000244 |
1 | 0.000989 | 0.000827 | 0.000000 | 0.001805 | 0.003611 | 0.000000 | 0.007444 | 0.002708 | 0.029546 | 0.003491 | ... | 0.000000 | 0.000000 | 0.066158 | 0.003854 | 0.003308 | 0.000000 | 0.004496 | 0.001519 | 0.000000 | 0.001805 |
2 | 0.000000 | 0.002917 | 0.001341 | 0.000000 | 0.000245 | 0.000000 | 0.000898 | 0.000980 | 0.012721 | 0.000758 | ... | 0.003366 | 0.013598 | 0.074407 | 0.003137 | 0.001571 | 0.021317 | 0.005750 | 0.003296 | 0.004163 | 0.000000 |
3 | 0.000000 | 0.004706 | 0.000000 | 0.003210 | 0.000000 | 0.001176 | 0.000588 | 0.000000 | 0.010506 | 0.003972 | ... | 0.002353 | 0.004321 | 0.037001 | 0.006395 | 0.000588 | 0.002941 | 0.007766 | 0.001620 | 0.005778 | 0.002568 |
4 | 0.000724 | 0.001212 | 0.001448 | 0.000661 | 0.001323 | 0.000606 | 0.000000 | 0.001323 | 0.007529 | 0.002558 | ... | 0.000606 | 0.001669 | 0.053176 | 0.000941 | 0.003636 | 0.000606 | 0.012706 | 0.002782 | 0.000000 | 0.001323 |
5 | 0.000000 | 0.000000 | 0.001862 | 0.000000 | 0.001700 | 0.000000 | 0.003115 | 0.003400 | 0.025401 | 0.011833 | ... | 0.004673 | 0.010011 | 0.081041 | 0.015724 | 0.001558 | 0.017133 | 0.003629 | 0.000000 | 0.011900 | 0.000000 |
6 | 0.000000 | 0.000000 | 0.000000 | 0.000570 | 0.000000 | 0.000522 | 0.001567 | 0.000000 | 0.029214 | 0.000441 | ... | 0.000000 | 0.000000 | 0.077904 | 0.000811 | 0.002090 | 0.000000 | 0.004463 | 0.000959 | 0.000000 | 0.002281 |
7 | 0.000910 | 0.000000 | 0.000000 | 0.000000 | 0.004986 | 0.000761 | 0.004568 | 0.000000 | 0.031930 | 0.005142 | ... | 0.000000 | 0.000699 | 0.076278 | 0.017739 | 0.009898 | 0.000000 | 0.005322 | 0.000000 | 0.000000 | 0.000000 |
8 | 0.000789 | 0.005939 | 0.000000 | 0.002161 | 0.004322 | 0.001980 | 0.000000 | 0.000000 | 0.017937 | 0.004457 | ... | 0.010559 | 0.021208 | 0.047150 | 0.004612 | 0.000000 | 0.027717 | 0.006662 | 0.001818 | 0.015846 | 0.000720 |
9 | 0.003283 | 0.001373 | 0.001641 | 0.005996 | 0.000000 | 0.001373 | 0.000000 | 0.001499 | 0.008532 | 0.006956 | ... | 0.004120 | 0.016393 | 0.061858 | 0.002133 | 0.000000 | 0.016480 | 0.012798 | 0.006305 | 0.011992 | 0.000000 |
10 | 0.000000 | 0.002337 | 0.001862 | 0.000850 | 0.000000 | 0.000779 | 0.000779 | 0.000850 | 0.006048 | 0.000000 | ... | 0.008567 | 0.025029 | 0.078025 | 0.003024 | 0.000000 | 0.018692 | 0.004839 | 0.002145 | 0.002550 | 0.000850 |
11 rows × 7102 columns
#easy to program, but let's use a robust version from sklearn!
from sklearn.metrics.pairwise import cosine_similarity
similarity=cosine_similarity(document_term_matrix)
#Note here that the `cosine_similiary` can take
#an entire matrix as its argument
similarity_df=pd.DataFrame(similarity, index=names, columns=names)
similarity_df.ix[1].order(ascending=False)
Austen_Sense.txt 1.000000 Austen_Pride.txt 0.828285 Austen_Emma.txt 0.801833 ABronte_Tenant.txt 0.777003 ABronte_Agnes.txt 0.750176 CBronte_Jane.txt 0.739302 CBronte_Villette.txt 0.713728 Dickens_David.txt 0.704389 Dickens_Hard.txt 0.698309 CBronte_Professor.txt 0.671603 Dickens_Bleak.txt 0.666095 Name: Austen_Sense.txt, dtype: float64
you've already seen hierarchical clustering
#here's the blackbox
from sklearn.manifold import MDS
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
positions= mds.fit_transform(1-similarity)
positions.shape
(11, 2)
It's an 11 by 2 matrix
OR
simply an (x,y) coordinate pair for each of our texts
def plot_mds(positions, names):
xs, ys=positions[:, 0], positions[:, 1]
for x, y, name in zip(xs, ys, names):
plt.scatter(x,y)
plt.text(x,y,name)
plt.show()
#let's plot it: I've set up a black box
plot_mds(positions,names)
/home/mljones/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:1282: UserWarning: findfont: Font family [u'monospace'] not found. Falling back to Bitstream Vera Sans (prop.get_family(), self.defaultFamily[fontext]))
names=[name.replace(".txt", "") for name in names]
plot_mds(positions,names)
What has this got us?
It suggests that even this crude measure of similarity is able to capture something significant.
Note: the axes don't really mean anything
python
package gensim
¶"corpora" = a collection of documents or texts
gensim
likes its documents to be a list of lists of words, not a list of strings
Get the stoplist in the data directory in my github.
our_texts, names=readtextfiles("text_examples/PCCIPtext")
names
['280_EO12977_1995.txt', '249_OTAHoldingtheEdgeAppendices.txt', '332_S106InternetSecurityandPrivacy.txt', '467_GAOElectronidWarfareRadarWarnings28pp.txt', '202_CIPBoard21StepsBooklet.txt', '39_ClintonPolicyCIP_PDD63.txt', '487_GAOElectronicWarefareAFInadequacies34pp.txt', '452_Govt Activities to Protect Elec Grid.2.4.05.txt', '501_GAOAdequacyInfoDefenseIndBasetestimony14pp.txt', '80_cip_report_1.11.txt', '428_GAOMaintainingMilitaryPresenceinIndustrialEnviron62pp.txt', '117_cip_report_2.8.txt', '245_OTAElectronicSurveillance.txt', '6_RL30153.txt', '536_GAOFEMAProgressResponse.txt', '246_OTAElectronicSurveillanceDigitalAge.txt', '396_GAOTerrorismInsurance022702.txt', '12_WinningtheWaronTerror.txt', '541_GAOIndustryChemFacilitiesSecurity.txt', '438_GAODefenseIndustrialBase198115pp.txt', '201_CIAOPracticesforSecuringCIassets.txt', '9_GAOChallengesreport.txt', '103_cip_report_2.10.txt', '436_GAODODIndustrialPreparednessProgramNeeds60pp.txt', '123_HartRudmanPhaseIIIAddendumVol_VI_Intel.txt', '475_GAONatSecurityUseofPDDs20pp.txt', '29_CanadaLessonsLearned0902.txt', '348_GAOAttacksatDefenseComputersRiskMay1996report.txt', '128_SCADAWhitepaperfinal1.txt', '238_OTAComputerBasedNatInfoSystems.txt', '457_GAOStatusDefenseIndustrialBase198524pp.txt', '435_GAODODStarWars4pp.txt', '235_OTAAdvancedNetworkTechnology.txt', '296_DefenseScienceBoardDODRolesVol2A.txt', '511_GAOEmbeddedComputerFA18Software.txt', '517_GAOMilitarySatellitesLeasing.txt', '301_HeritageWhosOnFirst.txt', '490_GAODefenseResearch10labs36pp.txt', '311_S105Counterrorism5YearPlan56pp.txt', '485_GAOEmbeddedComputerSystemsDODsoftware20pp.txt', '462_GAONavyStrategicHomePortingPlanmoretestimony12pp.txt', '121_Hart-RudmanPhaseIIIAddendumEducation_Addendum.txt', '25_GAOCIPImprovingInfoSharingReport0704.txt', '275_EO11051_1962.txt', '471_GAODODChemWarfareShelters26pp.txt', '270_RANDTestimonyTerrorismPublicSurfaceTrans.txt', '31_GAOCIPCommercialSatelliteSecurity.txt', '232_OTA1977TechAssessinBusandGov.txt', '244_OTAElectronicRecordsPrivacy.txt', '237_OTABuildingSecureFuture.txt', '157_S108VirtualThreathearings.txt', '464_GAONationalDefenseStockpile28pp.txt', '529_GAOCombatingTerrorismIntergovernmentalPartnerships.txt', '365_Gilmore.First Report 12.15.99.txt', '19_GAOCIPChallengesTestimonyDacey.txt', '466_GAOInfoMngmntContinentalArmy20pp.txt', '500_GAOAFJammer30pp.txt', '514_GAOIndustrialBaseForeignDependenceRisk.txt', '304_McNairBioTerrorismParkerpaper.txt', '256_OTASelectedElectronicTransferIssues.txt', '456_Risk Mgmt and CI Protection 9.2.04.txt', '539_GAOHomelandSecurityCriticalDesignandImplIssues.txt', '5_CriticalFoundationsPCCIP.txt', '496_GAOChemWarfareRandD26pp.txt', '325_S106CyberAttackPrivacyImplications.txt', '125_CSISSilent-Vector-Brief0903.txt', '10_wp_cip_zeichner.txt', '298_DefenseScienceBoardDODRolesVolumeI.txt', '354_GAOAdvancesandRemainingChallengesKeyInfra.txt', '241_OTAEffectsofTechnonFinancialSystems.txt', '292_CSISAssessingtheRisksofCyberTerrorismLewis2002.txt', '353_BrookingsOrszagCIPtestimony0903.txt', '437_GAODefenseScienceBoardTaskForceEmbedded7pp.txt', '468_GAODODInteroperabilityC3Systems32pp.txt', '448_Computer Security.Summary of Federal Laws etc. 4.16.04.txt', '532_GAOContinuityofOpsEssentialServices.txt', '239_OTADefendingSecretsSharingData.txt', '107_cip_report_3.7.txt', '422_GAORestructuringNeededDODIndustrial36pp.txt', '84_cip_report_2.3.txt', '515_GAOInfoTechnologyNavyConsolidation.txt', '250_OTAInfoSecPrivacyNetworkedSystems.txt', '498_GAOBaseSecurityAllegedProblemsDepotff16pp.txt', '253_OTARedesigningDefense.txt', '391_GAOILOVEYOUvirusRhodestestimony051000.txt', '459_GAONSASecurePhones5pp.txt', '519_GAOMobilityRequirements.txt', '470_GAODODIndustrialBaseAmmunition12pp.txt', '520_GAONationalDefenseStockpile1992.txt', '79_cip_report_1.10.txt', '271_RANDTestimonyTerrorismRailSecurity.txt', '47_CouncilonForRelationsHartRudmanTaskForce2002.txt', '266_RANDContainerizedSupplyChain.txt', '81_cip_report_1.12.txt', '122_HartRudmanPhaseIIIAddendumVol_V_DoS.txt', '469_GAODODInteroperability17pp.txt', '87_cip_report_2.6.txt', '15_CIPAPrimerCRS.txt', '483_GAOGPSReceiverProblems20pp.txt', '476_GAOInfoSecurityPresAppointees21pp.txt', '521_WhiteHouseDirectiveonCIIdentGWB2003.txt', '262_OTAStratDefInitiativeanalysis283pp.txt', '542_GAOPotentialTerroristAttacksFinancialMarkets.txt', '423_GAONtlDefExecReserve198310pp.txt', '492_GAODefenseIndSecurityContracts18pp.txt', '248_OTAFedGovInfoTechOversight.txt', '543_GAOProtectionofChemicalWaterInfrastructure2005.txt', '527_GAOCatastropheRisks.txt', '265_RANDConceptsEnhancingCIP.txt', '126_CSISSilentVectorOverview.txt', '152_H108Powerblackoutshearings.txt', '277_EO12656_1988.txt', '352_NDUCompromiseinAnUncompromisingWorld14pp.txt', '510_GAOElectronicWarfareRadarDuplication.txt', '486_GAOElectronicWarfaretesting.txt', '236_OTABallasticMissiles328pp..txt', '278_EO12829_1993.txt', '506_GAOCorpsofEngineersSupport.txt', '274_8636.txt', '477_GAOInfoSecurityNonDisclosureUse199136pp.txt', '255_OTASciTechFirstAmendment.txt', '82_cip_report_2.1.txt', '34_GAOCIPSignifChallengesTestimonyWillemsson091201.txt', '393_GAOInfoSecurityMajorDeptsAgencies.txt', '7_RL32531.txt', '242_OTAElectronicBullsandBears.txt', '302_HouseCommitteeCybersecurityfortheHomeland.txt', '355_GAOBioterrorism111903testimony.txt', '494_GAOCountermeasuresRadarSurvivability16pp.txt', '390_GAOILOVEYOUvirusBrock2002testimony.txt', '11_13839.txt', '453_Hmld Sec.Bnkg and Fin Infra Continuity.3.16.04.txt', '484_GAOEmbeddedComputerTechtestimony7pp.txt', '200_USAFRinaldiSharingtheKnowledge.txt', '219_NPC_Security_GuidanceNatGasOil.txt', '118_cip_report_3.6.txt']
our_texts=data_cleanse(our_texts)
#improved stoplist--may be too complete
stop=[]
with open('stoplist-multilingual') as f:
stop=f.readlines()
stop=[word.strip('\n') for word in stop]
texts = [[word for word in document.lower().split() if word not in stop] for document in our_texts] #gensim requires list of list of words in documents
from gensim import corpora, models, similarities, matutils
"""gensim includes its own vectorizing tools"""
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
number_topics=40
model = models.LdaMulticore(corpus, id2word=dictionary, num_topics=number_topics, passes=10)
#multicore faster but seems to crash more in my experience
# alternatively, for single core process
#model=models.LdaModel(corpus, id2word=dictionary, num_topics=number_topics, passes=10)
model.show_topics()
[u'0.011*interoperability + 0.011*technology + 0.009*assessment + 0.007*joint + 0.006*air + 0.006*services + 0.006*hr + 0.006*dr + 0.005*c3 + 0.005*public', u'0.014*federal + 0.013*security + 0.011*agencies + 0.010*government + 0.010*agency + 0.009*technology + 0.008*national + 0.008*management + 0.007*data + 0.007*systems', u'0.010*fbi + 0.009*department + 0.009*training + 0.008*local + 0.007*terrorist + 0.007*federal + 0.007*crime + 0.006*terrorism + 0.006*agencies + 0.006*law', u'0.008*technology + 0.008*security + 0.007*government + 0.007*national + 0.006*defense + 0.005*software + 0.005*program + 0.004*systems + 0.004*industry + 0.004*development', u'0.016*stockpile + 0.014*dod + 0.008*materials + 0.006*national + 0.005*year + 0.005*data + 0.004*market + 0.004*sensitivity + 0.004*percent + 0.004*material', u'0.000*vought + 0.000*renical + 0.000*regi + 0.000*regul + 0.000*rei + 0.000*rej + 0.000*rel + 0.000*remi + 0.000*reoubl + 0.000*responsi', u'0.008*6633 + 0.007*y2k + 0.007*systems + 0.006*security + 0.006*critical + 0.006*sjud4 + 0.006*2000 + 0.006*po + 0.006*psn + 0.006*00000', u'0.015*market + 0.014*trading + 0.014*stock + 0.013*markets + 0.013*futures + 0.012*securities + 0.009*exchange + 0.007*clearing + 0.007*options + 0.006*price', u'0.001*functions5 + 0.001*year25 + 0.000*shuts + 0.000*warehouses + 0.000*3300024 + 0.000*sectors32 + 0.000*396 + 0.000*repl + 0.000*reporti + 0.000*reoubl', u'0.014*software + 0.011*bmd + 0.010*space + 0.008*battle + 0.007*defense + 0.007*systems + 0.006*based + 0.006*weapons + 0.006*phase + 0.005*soviet']
model.show_topics(number_topics,10,formatted=False)[:4]
[[(1.4033118158854899e-05, u'vought'), (1.4033118158854899e-05, u'renical'), (1.4033118158854899e-05, u'regi'), (1.4033118158854899e-05, u'regul'), (1.4033118158854899e-05, u'rei'), (1.4033118158854899e-05, u'rej'), (1.4033118158854899e-05, u'rel'), (1.4033118158854899e-05, u'remi'), (1.4033118158854899e-05, u'reoubl'), (1.4033118158854899e-05, u'responsi')], [(0.011278903088321071, u'security'), (0.010713988785028378, u'privacy'), (0.010062427105057696, u'data'), (0.0096337561024090971, u'systems'), (0.0087066129030076694, u'electronic'), (0.0067674206434191562, u'federal'), (0.0065141373728968095, u'law'), (0.0064956018808460221, u'government'), (0.0054754369260244041, u'technology'), (0.0053667785813350929, u'public')], [(0.0016946640945281974, u'stall'), (0.0012738158095899597, u'comma'), (0.00085272813758866569, u'combatants'), (0.00085272813758866569, u'cdn'), (0.00085272813758866569, u'platoon'), (0.00085272813758866569, u'caliber'), (0.00085272813758866569, u'060859'), (0.00085272813758866569, u'062051'), (0.00085272813758866569, u'chap'), (0.00085272813758866569, u'9004')], [(0.023668183380266692, u'defense'), (0.012588845173930892, u'production'), (0.011403461621551791, u'base'), (0.010966735571204044, u'military'), (0.0073688828589395412, u'future'), (0.0069157226112579046, u'industrial'), (0.0065580172971887278, u'technology'), (0.0056887650854924128, u'maintenance'), (0.0054510380295830635, u'government'), (0.0054495853028934914, u'systems')]]
topics_indexed=[[b for (a,b) in topics] for topics in model.show_topics(number_topics,10,formatted=False)]
topics_indexed=pd.DataFrame(topics_indexed)
topics_indexed
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | vought | renical | regi | regul | rei | rej | rel | remi | reoubl | responsi |
1 | security | privacy | data | systems | electronic | federal | law | government | technology | public |
2 | stall | comma | combatants | cdn | platoon | caliber | 060859 | 062051 | chap | 9004 |
3 | defense | production | base | military | future | industrial | technology | maintenance | government | systems |
4 | vought | renical | regi | regul | rei | rej | rel | remi | reoubl | responsi |
5 | federal | security | agencies | government | agency | technology | national | management | data | systems |
6 | vought | renical | regi | regul | rei | rej | rel | remi | reoubl | responsi |
7 | 00 | nsc | stockpile | study | defense | goals | materials | assumptions | proposed | foreign |
8 | security | dod | homeland | defense | guard | national | missions | dsb | support | technology |
9 | terrorism | weapons | panel | terrorist | national | biological | federal | threat | 1999 | terrorists |
10 | stockpile | dod | materials | national | year | data | market | sensitivity | percent | material |
11 | defense | soviet | bmd | nuclear | strategic | abm | united | treaty | weapons | defenses |
12 | air | force | navy | army | systems | equipment | cost | officials | personnel | support |
13 | market | trading | stock | markets | futures | securities | exchange | clearing | options | price |
14 | organizations | attacks | financial | markets | operations | securities | market | telecommunications | continuity | trading |
15 | vought | renical | regi | regul | rei | rej | rel | remi | reoubl | responsi |
16 | chemical | report | program | defense | warfare | biological | reported | annual | dod | summaries |
17 | infrastructure | critical | security | national | sector | government | cyber | protection | systems | private |
18 | functions5 | year25 | shuts | warehouses | 3300024 | sectors32 | 396 | repl | reporti | reoubl |
19 | industrial | planning | defense | production | base | program | preparedness | mobilization | dod | requirements |
20 | science | teachers | education | teacher | national | commission | mathematics | pipeline | school | public |
21 | food | terrorism | bioterrorism | biological | 1999 | agriculture | national | agricultural | federal | emergency |
22 | materials | hazardous | transportation | federal | training | regulations | dot | response | emergency | safety |
23 | interoperability | technology | assessment | joint | air | services | hr | dr | c3 | public |
24 | 6633 | y2k | systems | security | critical | sjud4 | 2000 | po | psn | 00000 |
25 | software | bmd | space | battle | defense | systems | based | weapons | phase | soviet |
26 | ati | program | army | chemi | cl | wi | dod | gao | defense | ns |
27 | air | force | production | testing | operational | jammers | alq | receiver | aircraft | sets |
28 | security | homeland | cip | report | national | project | infrastructure | 2003 | risk | critical |
29 | technology | security | government | national | defense | software | program | systems | industry | development |
30 | financial | services | systems | service | industry | institutions | market | technology | credit | funds |
31 | vought | renical | regi | regul | rei | rej | rel | remi | reoubl | responsi |
32 | gao | federal | security | pki | agencies | facilities | chemical | water | key | systems |
33 | insurance | catastrophe | insurers | risk | coverage | terrorism | losses | reinsurance | companies | natural |
34 | security | federal | agencies | government | department | homeland | agency | dhs | record | records |
35 | air | force | systems | warfare | equipment | electronic | test | support | gao | combat |
36 | intelligence | secretary | policy | department | national | staff | security | affairs | issues | office |
37 | reproduced | reproduction | copyright | owner | prohibited | infrastructure | compilation | library | documents | 51 |
38 | fbi | department | training | local | terrorist | federal | crime | terrorism | agencies | law |
39 | vought | renical | regi | regul | rei | rej | rel | remi | reoubl | responsi |
So which topics most significant for each document? Pass a bag of words version of each document to the model.
model[dictionary.doc2bow(texts[1])]
[(3, 0.19718915678001858), (29, 0.79207183594768116)]
Let's find them for every document--with a list comprehension, of course
primarytopics=[model[dictionary.doc2bow(text)] for text in texts]
make it pretty with a list comprehension
import numpy as np
primarytopics_matrix=pd.DataFrame(np.matrix([matutils.sparse2full(primarytopic, number_topics) for primarytopic in primarytopics]))
primarytopics_matrix.ix[18].plot(kind="bar")
<matplotlib.axes._subplots.AxesSubplot at 0x7fae034e6c90>
primarytopics_matrix.ix[18].plot(kind="bar", title=names[18])
<matplotlib.axes._subplots.AxesSubplot at 0x7fae0339a250>
topics_indexed.ix[[17,21,32]]
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
17 | infrastructure | critical | security | national | sector | government | cyber | protection | systems | private |
21 | food | terrorism | bioterrorism | biological | 1999 | agriculture | national | agricultural | federal | emergency |
32 | gao | federal | security | pki | agencies | facilities | chemical | water | key | systems |
primarytopics_matrix.ix[4].plot(kind="bar", title=names[4])
<matplotlib.axes._subplots.AxesSubplot at 0x7fadf8ffaa90>
topics_indexed.ix[[1,17,28]]
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
1 | security | privacy | data | systems | electronic | federal | law | government | technology | public |
17 | infrastructure | critical | security | national | sector | government | cyber | protection | systems | private |
28 | security | homeland | cip | report | national | project | infrastructure | 2003 | risk | critical |
topics_indexed.ix[28]
0 security 1 homeland 2 cip 3 report 4 national 5 project 6 infrastructure 7 2003 8 risk 9 critical Name: 28, dtype: object
primarytopics_matrix[28].plot(kind="bar")
<matplotlib.axes._subplots.AxesSubplot at 0x7fadf8c84610>
primarytopics_matrix[28].plot(kind="bar", title=str(topics_indexed.ix[28]))
<matplotlib.axes._subplots.AxesSubplot at 0x7fadebedde10>
#that's ugly!