%matplotlib inline
%run lexique.py

# A few sanity checks against the database.
sql("SELECT * FROM lexique WHERE lemme = 'avoir' LIMIT 5")
sql("SELECT * FROM lemme ORDER BY freqlivres DESC LIMIT 5")
sql("SELECT * FROM lemme_simple ORDER BY freqlivres DESC LIMIT 5")
sql("SELECT * FROM lemme_simple ORDER BY freqfilms2 DESC LIMIT 5")

# Total frequency per grammatical category (cgram).
cgram_freq = sql("""
SELECT cgram,
       SUM(freqfilms2) AS freqfilms2,
       SUM(freqlivres) AS freqlivres
  FROM lemme GROUP BY cgram
""", index_col='cgram')
cgram_freq

# Merge categories that share a three-letter prefix and plot their shares.
cgram_freq_summary = cgram_freq.groupby(lambda x: x[0:3]).sum()
plt.figure(figsize=(7,7))
plt.subplot(aspect=True)
plt.pie(cgram_freq_summary.freqfilms2,
        labels=cgram_freq_summary.index.values, colors=colors)
plt.title("Parts of speech")

# Cumulative film coverage as a function of vocabulary size.
coverage = sql("""
SELECT lemme, freqfilms2 FROM lemme_simple ORDER BY freqfilms2 DESC""")
coverage.index += 1
coverage['film_coverage'] = \
    100*coverage['freqfilms2'].cumsum() / coverage['freqfilms2'].sum()
del coverage['lemme']
del coverage['freqfilms2']
coverage[0:5]

# Add the corresponding book coverage.
book_coverage = sql("""
SELECT lemme, freqlivres FROM lemme_simple ORDER BY freqlivres DESC""")
book_coverage.index += 1
coverage['book_coverage'] = \
    100*book_coverage['freqlivres'].cumsum() / book_coverage['freqlivres'].sum()
coverage[0:5]

plt.plot(coverage.index.values, coverage.film_coverage, label="Film Coverage")
plt.plot(coverage.index.values, coverage.book_coverage, label="Book Coverage")
plt.legend(loc='lower right')
plt.title('Text Coverage')
plt.xlabel('Vocabulary size')
plt.ylabel('% coverage')
plt.xlim((0,10000))

coverage.loc[[250, 500, 1000, 2000, 4000, 8000, 16000], :]

# Merge related cgrams, sum frequency over (cgram, lemme) groups,
# and sort by (cgram, freqfilms2).
cgram_lemme_freq = sql("""
SELECT cgram, SUM(freqfilms2) AS freqfilms2
  FROM (SELECT CASE WHEN cgram='AUX' THEN 'VER'
                    ELSE SUBSTR(cgram, 1, 3) END AS cgram,
               lemme, freqfilms2
          FROM lemme)
 GROUP BY cgram, lemme
 ORDER BY cgram, freqfilms2 DESC
""")

# Convert freqfilms2 to a cumulative percentage over each cgram group.
cgram_col = cgram_lemme_freq['cgram']
normalized_freq = cgram_lemme_freq.groupby(cgram_col).transform(lambda x: x/x.sum())
cumulative_freq = normalized_freq.groupby(cgram_col).cumsum()
cgram_lemme_freq['freqfilms2'] = 100.0*cumulative_freq

# Sequentially number the rows within each cgram group so we can see the
# vocabulary size corresponding to each cumulative percentage.
cgram_lemme_freq['rang'] = cgram_lemme_freq.groupby(cgram_col).cumcount()+1

# Index by cgram group, and vocabulary size within the group. Uncomment the
# last line to view the data.
cgram_lemme_freq.set_index(['cgram', 'rang'], inplace=True)
#cgram_lemme_freq

# Plot a cumulative coverage curve for each of the given parts of speech.
def plot_cgrams(labels):
    for key in labels.keys():
        cgram_group = cgram_lemme_freq.loc[key]
        plt.plot(cgram_group.index.values, cgram_group.freqfilms2,
                 label=labels[key])
    plt.legend(loc='lower right')
    plt.title('Text Coverage by Part of Speech (films)')
    plt.xlabel('Words known by part of speech')
    plt.ylabel('% coverage')
    plt.ylim((0,100))
    plt.axhline(y=90, color='k', ls='dashed')

plt.figure(figsize=(12,4))

plt.subplot(121)
small_cgram_labels = {
    'PRO': 'Pronouns',
    'ADV': 'Adverbs',
    'PRE': 'Prepositions',
    'CON': 'Conjunctions',
    'ART': 'Articles'
}
plot_cgrams(small_cgram_labels)
plt.xlim((0,150))

plt.subplot(122)
large_cgram_labels = {
    'NOM': 'Nouns',
    'VER': 'Verbs',
    'ADJ': 'Adjectives'
}
plot_cgrams(large_cgram_labels)
plt.xlim((0,10000))
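Everything in this notebook leans on helpers pulled in by `%run lexique.py` at the top: a `sql()` function that runs a query against the Lexique database and returns a DataFrame, a `save_tsv()` writer used near the end, and a `colors` palette for the pie charts. That script isn't reproduced here; the sketch below is only a rough idea of what it presumably provides, and the database filename and the color list are my assumptions, not the real script.

# Hypothetical stand-in for the helpers that lexique.py presumably defines.
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt

_conn = sqlite3.connect('lexique.sqlite3')  # assumed filename

def sql(query, index_col=None):
    """Run a query against the Lexique database and return a DataFrame."""
    return pd.read_sql_query(query, _conn, index_col=index_col)

def save_tsv(path, df):
    """Write a DataFrame as a tab-separated file."""
    df.to_csv(path, sep='\t', encoding='utf-8')

# An arbitrary color cycle for the pie charts (assumed).
colors = ['#8dd3c7', '#ffffb3', '#bebada', '#fb8072', '#80b1d3',
          '#fdb462', '#b3de69', '#fccde5', '#d9d9d9', '#bc80bd']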
# Only include the parts of speech used in our graph.
cgram_labels = small_cgram_labels.copy()
cgram_labels.update(large_cgram_labels)
interesting = cgram_lemme_freq.loc[list(cgram_labels.keys())]

# We'll use this to build a list of columns in our final table.
columns = []

# Calculate the minimum number of words needed for a given percentage of coverage.
for threshold in [75, 90, 95, 98, 99, 99.5]:
    # Discard all the rows below our threshold.
    over_threshold = interesting[interesting['freqfilms2'] >= threshold]
    # Take the first row that remains in each group.
    over_threshold.reset_index(inplace=True)
    over_threshold.set_index('cgram', inplace=True)
    first_over = over_threshold.groupby(level=0).first()
    # Keep only a single column named after our threshold.
    del first_over['freqfilms2']
    first_over.rename(columns={'rang': '%r%%' % threshold}, inplace=True)
    columns.append(first_over)

# Join all the columns together.
table = columns[0].join(columns[1:])

# Clean up the table a bit and add a total.
table.index.names = ['Part of speech']
table.index = table.index.map(lambda i: cgram_labels[i])
table.loc['TOTAL'] = table.sum()
table

# The raw verb table, sorted by film frequency.
verbs = sql("SELECT * FROM verbe ORDER BY freqfilms2 DESC")
verbs[0:15]

# Compare the verb groups by total frequency and by number of verbs.
plt.figure(figsize=(8,8))
plt.subplot(121, aspect=True)
group_freq = verbs.groupby(verbs['groupe']).sum()
plt.pie(group_freq.freqfilms2, labels=group_freq.index.values, colors=colors)
plt.title("Verb group frequency")
plt.subplot(122, aspect=True)
group_size = verbs.groupby(verbs['groupe']).count()
plt.pie(group_size.lemme, labels=group_size.index.values, colors=colors)
plt.title("Verb group size (words)")

# Extract the columns we need, and get rid of 'aller'.
group_freq = verbs[['groupe', 'lemme', 'freqfilms2']].copy()
group_freq = group_freq[group_freq['groupe'] != 'aller']

# Calculate coverage percentages for frequency ranks in each group.
groupe_col = group_freq['groupe']
normalized_freq = group_freq.groupby(groupe_col).transform(lambda x: x/x.sum())
cumulative_freq = 100.0*normalized_freq.groupby(groupe_col).cumsum()
group_freq['freqfilms2'] = cumulative_freq
group_freq['rang'] = group_freq.groupby(groupe_col).cumcount()+1
group_freq.set_index(['groupe', 'rang'], inplace=True)
group_freq[0:10]

# Sigh. My database is in French, and my libraries are in English.
# There's no way to avoid coding in franglais, I fear.
for group in ['er', 'ir', 're']:
    g = group_freq.loc[group]
    plt.plot(g.index.values, g.freqfilms2, label=group)
plt.title('Verb Coverage by Group')
plt.legend(loc='lower right')
plt.xlabel('Verbs known in group')
plt.ylabel('% coverage')
plt.xlim((1,100))
plt.ylim((0,100))

# Format the 40 most frequent verbs in a group as an HTML paragraph.
def html_for_group(groupe):
    lst = ', '.join(group_freq.loc[groupe].loc[1:40]['lemme'].tolist())
    return '<p>-%s verbs: %s.</p>' % (groupe, lst)
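The `HTML` display helper used just below is presumably `IPython.display.HTML`; if `lexique.py` doesn't already import it, something like this is needed:

from IPython.display import HTML  # assumed to be provided by lexique.py or imported here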
HTML('<p>-er verbs: aller.</p>'
     + html_for_group('ir') + html_for_group('re'))

# Join each verb to its conjugation pattern and normalize frequencies to percentages.
verbs2 = sql("""
SELECT conjugaison.nom AS conjugaison, lemme, freqfilms2, resume
  FROM verbe
  LEFT OUTER JOIN conjugaison ON verbe.conjugaison = conjugaison.nom
 ORDER BY freqfilms2 DESC
""")
verbs2['freqfilms2'] = 100 * verbs2['freqfilms2'] / verbs2['freqfilms2'].sum()

# Summarize each conjugation pattern: example verbs, verb count,
# total frequency, and a short description.
def summarize_conjugator(grp):
    return pd.Series(dict(exemples=', '.join(grp.lemme[0:5]),
                          compte=grp.lemme.count(),
                          freqfilms2=grp.freqfilms2.sum(),
                          resume=grp.resume.iloc[0]))

conjugators = (verbs2.groupby('conjugaison')
               .apply(summarize_conjugator)
               .sort_values('freqfilms2', ascending=False))

# Rank the conjugation patterns and turn frequency into cumulative coverage.
conjugators.reset_index(inplace=True)
conjugators.index.names = ['rang']
conjugators.reset_index(inplace=True)
conjugators['rang'] = conjugators['rang'] + 1
conjugators['freqfilms2'] = conjugators['freqfilms2'].cumsum()
conjugators.set_index('rang', inplace=True)
save_tsv('conjugators.tsv', conjugators)
conjugators

plt.plot(conjugators.index.values, conjugators.freqfilms2)
plt.title('Verb Coverage by Conjugator')
#plt.legend(loc = 'lower right')
plt.xlabel('Verb conjugations known')
plt.ylabel('% coverage')
plt.ylim((0,100))
plt.xlim((1,60))