%matplotlib inline
%run lexique.py

# A few sanity checks against the database.
sql("SELECT * FROM lexique WHERE lemme = 'avoir' LIMIT 5")
sql("SELECT * FROM lemme ORDER BY freqlivres DESC LIMIT 5")
sql("SELECT * FROM lemme_simple ORDER BY freqlivres DESC LIMIT 5")
sql("SELECT * FROM lemme_simple ORDER BY freqfilms2 DESC LIMIT 5")

# Total frequency per grammatical category (cgram).
cgram_freq = sql("""
SELECT cgram,
       SUM(freqfilms2) AS freqfilms2,
       SUM(freqlivres) AS freqlivres
  FROM lemme GROUP BY cgram
""", index_col='cgram')
cgram_freq

# Merge categories that share a three-letter prefix and plot their shares.
cgram_freq_summary = cgram_freq.groupby(lambda x: x[0:3]).sum()
plt.figure(figsize=(7,7))
plt.subplot(aspect=True)
plt.pie(cgram_freq_summary.freqfilms2,
        labels=cgram_freq_summary.index.values, colors=colors)
plt.title("Parts of speech")

# Cumulative film coverage as a function of vocabulary size.
coverage = sql("""
SELECT lemme, freqfilms2 FROM lemme_simple ORDER BY freqfilms2 DESC""")
coverage.index += 1
coverage['film_coverage'] = \
    100*coverage['freqfilms2'].cumsum() / coverage['freqfilms2'].sum()
del coverage['lemme']
del coverage['freqfilms2']
coverage[0:5]

# Add the corresponding book coverage.
book_coverage = sql("""
SELECT lemme, freqlivres FROM lemme_simple ORDER BY freqlivres DESC""")
book_coverage.index += 1
coverage['book_coverage'] = \
    100*book_coverage['freqlivres'].cumsum() / book_coverage['freqlivres'].sum()
coverage[0:5]

plt.plot(coverage.index.values, coverage.film_coverage, label="Film Coverage")
plt.plot(coverage.index.values, coverage.book_coverage, label="Book Coverage")
plt.legend(loc='lower right')
plt.title('Text Coverage')
plt.xlabel('Vocabulary size')
plt.ylabel('% coverage')
plt.xlim((0,10000))

coverage.loc[[250, 500, 1000, 2000, 4000, 8000, 16000], :]

# Merge related cgrams, sum frequency over (cgram, lemme) groups,
# and sort by (cgram, freqfilms2).
cgram_lemme_freq = sql("""
SELECT cgram, SUM(freqfilms2) AS freqfilms2
  FROM (SELECT CASE WHEN cgram='AUX' THEN 'VER'
                    ELSE SUBSTR(cgram, 1, 3) END AS cgram,
               lemme, freqfilms2
          FROM lemme)
 GROUP BY cgram, lemme
 ORDER BY cgram, freqfilms2 DESC
""")

# Convert freqfilms2 to a cumulative percentage over each cgram group.
cgram_col = cgram_lemme_freq['cgram']
normalized_freq = cgram_lemme_freq.groupby(cgram_col).transform(lambda x: x/x.sum())
cumulative_freq = normalized_freq.groupby(cgram_col).cumsum()
cgram_lemme_freq['freqfilms2'] = 100.0*cumulative_freq

# Sequentially number the rows within each cgram group so we can see the
# vocabulary size corresponding to each cumulative percentage.
cgram_lemme_freq['rang'] = cgram_lemme_freq.groupby(cgram_col).cumcount()+1

# Index by cgram group, and vocabulary size within the group. Uncomment the
# last line to view the data.
cgram_lemme_freq.set_index(['cgram', 'rang'], inplace=True)
#cgram_lemme_freq

# Plot a cumulative coverage curve for each of the given parts of speech.
def plot_cgrams(labels):
    for key in labels.keys():
        cgram_group = cgram_lemme_freq.loc[key]
        plt.plot(cgram_group.index.values, cgram_group.freqfilms2,
                 label=labels[key])
    plt.legend(loc='lower right')
    plt.title('Text Coverage by Part of Speech (films)')
    plt.xlabel('Words known by part of speech')
    plt.ylabel('% coverage')
    plt.ylim((0,100))
    plt.axhline(y=90, color='k', ls='dashed')

plt.figure(figsize=(12,4))

plt.subplot(121)
small_cgram_labels = {
    'PRO': 'Pronouns',
    'ADV': 'Adverbs',
    'PRE': 'Prepositions',
    'CON': 'Conjunctions',
    'ART': 'Articles'
}
plot_cgrams(small_cgram_labels)
plt.xlim((0,150))

plt.subplot(122)
large_cgram_labels = {
    'NOM': 'Nouns',
    'VER': 'Verbs',
    'ADJ': 'Adjectives'
}
plot_cgrams(large_cgram_labels)
plt.xlim((0,10000))
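Everything in this notebook leans on helpers pulled in by `%run lexique.py` at the top: a `sql()` function that runs a query against the Lexique database and returns a DataFrame, a `save_tsv()` writer used near the end, and a `colors` palette for the pie charts. That script isn't reproduced here; the sketch below is only a rough idea of what it presumably provides, and the database filename and the color list are my assumptions, not the real script.

# Hypothetical stand-in for the helpers that lexique.py presumably defines.
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt

_conn = sqlite3.connect('lexique.sqlite3')  # assumed filename

def sql(query, index_col=None):
    """Run a query against the Lexique database and return a DataFrame."""
    return pd.read_sql_query(query, _conn, index_col=index_col)

def save_tsv(path, df):
    """Write a DataFrame as a tab-separated file."""
    df.to_csv(path, sep='\t', encoding='utf-8')

# An arbitrary color cycle for the pie charts (assumed).
colors = ['#8dd3c7', '#ffffb3', '#bebada', '#fb8072', '#80b1d3',
          '#fdb462', '#b3de69', '#fccde5', '#d9d9d9', '#bc80bd']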
# Only include the parts of speech used in our graph.
cgram_labels = small_cgram_labels.copy()
cgram_labels.update(large_cgram_labels)
interesting = cgram_lemme_freq.loc[list(cgram_labels.keys())]

# We'll use this to build a list of columns in our final table.
columns = []

# Calculate the minimum number of words needed for a given percentage of coverage.
for threshold in [75, 90, 95, 98, 99, 99.5]:
    # Discard all the rows below our threshold.
    over_threshold = interesting[interesting['freqfilms2'] >= threshold]
    # Take the first row that remains in each group.
    over_threshold.reset_index(inplace=True)
    over_threshold.set_index('cgram', inplace=True)
    first_over = over_threshold.groupby(level=0).first()
    # Keep only a single column named after our threshold.
    del first_over['freqfilms2']
    first_over.rename(columns={'rang': '%r%%' % threshold}, inplace=True)
    columns.append(first_over)

# Join all the columns together.
table = columns[0].join(columns[1:])

# Clean up the table a bit and add a total.
table.index.names = ['Part of speech']
table.index = table.index.map(lambda i: cgram_labels[i])
table.loc['TOTAL'] = table.sum()
table

# The raw verb table, sorted by film frequency.
verbs = sql("SELECT * FROM verbe ORDER BY freqfilms2 DESC")
verbs[0:15]

# Compare the verb groups by total frequency and by number of verbs.
plt.figure(figsize=(8,8))
plt.subplot(121, aspect=True)
group_freq = verbs.groupby(verbs['groupe']).sum()
plt.pie(group_freq.freqfilms2, labels=group_freq.index.values, colors=colors)
plt.title("Verb group frequency")
plt.subplot(122, aspect=True)
group_size = verbs.groupby(verbs['groupe']).count()
plt.pie(group_size.lemme, labels=group_size.index.values, colors=colors)
plt.title("Verb group size (words)")

# Extract the columns we need, and get rid of 'aller'.
group_freq = verbs[['groupe', 'lemme', 'freqfilms2']].copy()
group_freq = group_freq[group_freq['groupe'] != 'aller']

# Calculate coverage percentages for frequency ranks in each group.
groupe_col = group_freq['groupe']
normalized_freq = group_freq.groupby(groupe_col).transform(lambda x: x/x.sum())
cumulative_freq = 100.0*normalized_freq.groupby(groupe_col).cumsum()
group_freq['freqfilms2'] = cumulative_freq
group_freq['rang'] = group_freq.groupby(groupe_col).cumcount()+1
group_freq.set_index(['groupe', 'rang'], inplace=True)
group_freq[0:10]

# Sigh. My database is in French, and my libraries are in English.
# There's no way to avoid coding in franglais, I fear.
for group in ['er', 'ir', 're']:
    g = group_freq.loc[group]
    plt.plot(g.index.values, g.freqfilms2, label=group)
plt.title('Verb Coverage by Group')
plt.legend(loc='lower right')
plt.xlabel('Verbs known in group')
plt.ylabel('% coverage')
plt.xlim((1,100))
plt.ylim((0,100))

# Format the 40 most frequent verbs in a group as an HTML paragraph.
def html_for_group(groupe):
    lst = ', '.join(group_freq.loc[groupe].loc[1:40]['lemme'].tolist())
    return '<p>-%s verbs: %s.</p>' % (groupe, lst)
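The `HTML` display helper used just below is presumably `IPython.display.HTML`; if `lexique.py` doesn't already import it, something like this is needed:

from IPython.display import HTML  # assumed to be provided by lexique.py or imported here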
HTML('<p>-er verbs: aller.</p>'
     + html_for_group('ir') + html_for_group('re'))

# Join each verb to its conjugation pattern and normalize frequencies to percentages.
verbs2 = sql("""
SELECT conjugaison.nom AS conjugaison, lemme, freqfilms2, resume
  FROM verbe
  LEFT OUTER JOIN conjugaison ON verbe.conjugaison = conjugaison.nom
 ORDER BY freqfilms2 DESC
""")
verbs2['freqfilms2'] = 100 * verbs2['freqfilms2'] / verbs2['freqfilms2'].sum()

# Summarize each conjugation pattern: example verbs, verb count,
# total frequency, and a short description.
def summarize_conjugator(grp):
    return pd.Series(dict(exemples=', '.join(grp.lemme[0:5]),
                          compte=grp.lemme.count(),
                          freqfilms2=grp.freqfilms2.sum(),
                          resume=grp.resume.iloc[0]))

conjugators = (verbs2.groupby('conjugaison')
               .apply(summarize_conjugator)
               .sort_values('freqfilms2', ascending=False))

# Rank the conjugation patterns and turn frequency into cumulative coverage.
conjugators.reset_index(inplace=True)
conjugators.index.names = ['rang']
conjugators.reset_index(inplace=True)
conjugators['rang'] = conjugators['rang'] + 1
conjugators['freqfilms2'] = conjugators['freqfilms2'].cumsum()
conjugators.set_index('rang', inplace=True)
save_tsv('conjugators.tsv', conjugators)
conjugators

plt.plot(conjugators.index.values, conjugators.freqfilms2)
plt.title('Verb Coverage by Conjugator')
#plt.legend(loc = 'lower right')
plt.xlabel('Verb conjugations known')
plt.ylabel('% coverage')
plt.ylim((0,100))
plt.xlim((1,60))