# Compare lemma frequencies between the Simple English and English Wikipedia
# versions of the same math-category articles.

# Notebook magic `%matplotlib inline`, written as a plain call so the file is
# also valid Python outside IPython (where get_ipython is undefined).
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import nltk
from collections import defaultdict
import pandas as pd
try:
    import cPickle as pickle
except ImportError:
    # Interpreters without cPickle (Python 3) fall back to the stdlib module.
    # NOTE(review): pickles written by Python 2 may need
    # pickle.load(f, encoding='latin1') under Python 3 -- confirm before
    # switching interpreters.
    import pickle

# Load pickle files generated by extract_wiki_text.py. These files hold
# dictionaries mapping article titles to article text, converted from
# MediaWiki XML with markup cleaned up by helper functions from
# https://github.com/bwbaugh/wikipedia-extractor/.
# Pickles are binary data, so the files are opened in 'rb' mode.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open('simple.p', 'rb') as f:
    sd = pickle.load(f)

with open('math.p', 'rb') as f:
    md = pickle.load(f)

# These are the pages that appeared in both Simple and English Wikipedia.
sd.keys()

# Documents are lists of sentences
sd["Ratio"]

md['Ratio']


def text_dict_to_term_dict(d):
    '''Compute relative lemma frequencies over a whole text dictionary.

    Every text chunk of every document is lowercased, tokenized with
    ``nltk.word_tokenize`` and lemmatized with ``nltk.WordNetLemmatizer``;
    only lemmas that are entirely alphabetical are counted.

    Parameters:
        d: dictionary mapping a title to an iterable of text strings.

    Returns:
        A ``(term_matrix, all_counts)`` tuple. ``term_matrix`` is a
        ``defaultdict(int)`` mapping each lemma to its count divided by
        ``all_counts``; ``all_counts`` is the total number of alphabetical
        lemma occurrences counted across all documents.
    '''
    # Bind the method once; it is called for every token in the corpus.
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    term_matrix = defaultdict(int)
    all_counts = 0
    for title in d:
        for paragraph in d[title]:
            # Tokenize lowercase words
            tokens = nltk.word_tokenize(paragraph.lower())
            for token in tokens:
                lem = lemmatize(token)
                # Remove non-alphabetical tokens
                if lem.isalpha():
                    term_matrix[lem] += 1
                    all_counts += 1
    # Turn raw counts into relative frequencies. float() forces true division
    # on Python 2. With no counted lemmas the dict is empty, so nothing is
    # divided by zero.
    total = float(all_counts)
    for term in term_matrix:
        term_matrix[term] /= total
    return term_matrix, all_counts


# This takes several seconds
sd_term_matrix, sd_count = text_dict_to_term_dict(sd)
md_term_matrix, md_count = text_dict_to_term_dict(md)

len(sd_term_matrix), len(md_term_matrix), sd_count, md_count

sd_terms = set(sd_term_matrix)
md_terms = set(md_term_matrix)

term_difference = {}
# Find differences in the term frequencies of the two corpora.
# .get(term, 0) yields the same value as indexing the defaultdict but does
# not insert the other corpus's terms into each vocabulary as a side effect.
for term in md_terms.union(sd_terms):
    term_difference[term] = (md_term_matrix.get(term, 0)
                             - sd_term_matrix.get(term, 0))

# Ascending order: terms relatively more frequent in Simple Wikipedia
# (negative difference) come first.
sorted_term_difference = sorted(term_difference.items(), key=lambda x: x[1])

# Number of terms that are relatively more frequent in Simple Wikipedia
sum(1 for _, difference in sorted_term_difference if difference < 0)

term_difference_df = pd.DataFrame(sorted_term_difference,
                                  columns=['term', 'term_difference'])
term_difference_df['en_tf'] = term_difference_df.term.apply(
    lambda x: md_term_matrix.get(x, 0))
term_difference_df['simple_tf'] = term_difference_df.term.apply(
    lambda x: sd_term_matrix.get(x, 0))

# Words most characteristic of Simple Wikipedia's math category pages
term_difference_df.head(20)

# Words most characteristic of English Wikipedia's math category pages
term_difference_df.tail(20)[::-1]

term_difference_df.to_csv('term_differences.csv', index=False, encoding='utf8')