In [1]:
import whtranscripts
import pandas as pd
from datetime import datetime
In [2]:
%matplotlib inline
import matplotlib as mpl
import mplstyle, mplstyle.styles.simple
# Accent colour reused by the bar charts and the Obama area series below.
BLUE = "#0077EE"
# Apply the "simple" base style first, then layer notebook-specific overrides
# on top of it with a second set() call.
mplstyle.set(mplstyle.styles.simple)
mplstyle.set({ 
    # Default size for every figure (the bar charts pass their own figsize).
    "figure.figsize": (10, 6),
    "axes": {
        # NOTE(review): matplotlib itself renamed "axes.color_cycle" to
        # "axes.prop_cycle"; confirm that mplstyle still accepts this key.
        "color_cycle": [ "teal", "red" ],
    },
    "lines": {
        "linewidth": 2,
        # Marker size 0 -- presumably to draw lines without point markers.
        "markersize": 0
    }
})
In [3]:
# Relative path of the directory that holds the press-conference data.
PRESS_CONFERENCE_DIR = "president_speech_notebooks/press_conference_data"

# Load the conferences through whtranscripts' directory loader.
conferences = whtranscripts.Conference.from_dir(PRESS_CONFERENCE_DIR)
In [4]:
# Flatten the passages of every conference into one list, keeping the
# conference order and, within a conference, the passage order.
all_passages = []
for conference in conferences:
    for passage in conference.passages:
        all_passages.append(passage)


def _transcript_day(passage):
    """Return the passage's transcript date as a datetime at midnight."""
    day = passage.transcript.date
    return datetime(day.year, day.month, day.day)


# One row per passage; the remaining columns are derived from the passage
# object stored in the "passages" column.
passages_df = pd.DataFrame(all_passages, columns=["passages"])
passage_objects = passages_df["passages"]
passages_df["president"] = passage_objects.apply(lambda passage: passage.transcript.president)
passages_df["speaker"] = passage_objects.apply(lambda passage: passage.speaker)
passages_df["datetime"] = passage_objects.apply(_transcript_day)
In [5]:
def is_president(row):
    """Decide whether a passage row was spoken by the president.

    ``row`` must support ``row["speaker"]`` and ``row["president"]``.
    The row counts as presidential when any of these holds:

    * the speaker label contains "The President" but not "Secretary";
    * the speaker label is exactly "The. President" or "Mr. President";
    * the speaker label contains the last word of the president's name
      and contains neither "Mrs." nor "Governor".

    A missing or empty speaker label is never presidential.
    Always returns a plain ``True`` or ``False``.
    """
    speaker = row["speaker"]

    # Exact-match labels are accepted as they are.
    if speaker == "The. President" or speaker == "Mr. President":
        return True

    # Nothing else can match without a non-empty speaker label.
    if not speaker:
        return False

    if "The President" in speaker and "Secretary" not in speaker:
        return True

    # Fall back to the president's surname (last word of the full name),
    # excluding labels that carry a "Mrs." or "Governor" title.
    surname = row["president"].split()[-1]
    return surname in speaker and "Mrs." not in speaker and "Governor" not in speaker

Making the Presidents DataFrame

In [6]:
# Flag every passage whose speaker label identifies the president.
passages_df["is_president"] = passages_df.apply(is_president, axis=1)

# Keep only the president's passages. The explicit .copy() gives
# presidents_df its own data, so the columns added to it afterwards do not
# raise SettingWithCopyWarning or depend on passages_df's internals.
presidents_df = passages_df[passages_df["is_president"]].copy()

# Number of words in each presidential passage, as reported by the passage.
presidents_df["word_count"] = presidents_df["passages"].apply(
    lambda passage: passage.get_word_count())
In [7]:
def _count_term(term):
    """Return, per presidential passage, the passage's count of `term`."""
    return presidents_df["passages"].apply(
        lambda passage: passage.count_occurrences(term))


# Add one count column per tracked expression. assign() returns a new frame
# instead of writing into a filtered slice, which avoids
# SettingWithCopyWarning and lets the cell be re-run safely.
presidents_df = presidents_df.assign(
    youknow=_count_term("you know"),
    folks=_count_term("folks"),
    okay=_count_term("okay"),
)

Time to Make the Bar Graphs

In [8]:
# Per-president totals of the numeric columns (word and term counts).
# numeric_only=True keeps the aggregation away from the passage objects,
# speaker strings and datetimes, which current pandas would otherwise try to
# add up too. groupby(...).sum() already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
presidents_sum = presidents_df.groupby("president").sum(numeric_only=True)
In [9]:
# Normalise each term total to occurrences per 10,000 spoken words.
# Column arithmetic replaces the row-wise apply(axis=1): it gives the same
# 10000 * count / word_count value without coercing each row to one dtype,
# and it also works when the frame is empty.
for term in ("folks", "okay", "youknow"):
    presidents_sum[term + "_per_10000"] = (
        10000 * presidents_sum[term] / presidents_sum["word_count"])
In [10]:
# Horizontal bar chart of 'okay' usage per 10,000 words, one bar per
# president, ordered by increasing rate.
# DataFrame.sort() no longer exists in pandas; sort_values() is the
# supported way to order the values before plotting.
okay_rate = presidents_sum["okay_per_10000"].sort_values(ascending=True)
ax = okay_rate.plot(kind="barh", figsize=(10, 8), width=0.75, color=BLUE, alpha=0.75)
ax.yaxis.grid(False)
ax.set_title("'Okay' Per 10,000 Words\n", fontsize=22, linespacing=0.5)
# The president names on the tick labels are enough; drop the axis label.
ax.set_ylabel("")
mpl.pyplot.setp(ax.get_xticklabels(), fontsize=14)
mpl.pyplot.setp(ax.get_yticklabels(), fontsize=16)
# End on a statement so the cell does not echo the last call's return value.
pass
In [11]:
# Horizontal bar chart of 'you know' usage per 10,000 words, one bar per
# president, ordered by increasing rate.
# DataFrame.sort() no longer exists in pandas; sort_values() is the
# supported way to order the values before plotting.
youknow_rate = presidents_sum["youknow_per_10000"].sort_values(ascending=True)
ax = youknow_rate.plot(kind="barh", figsize=(10, 8), width=0.75, color=BLUE, alpha=0.75)
ax.yaxis.grid(False)
ax.set_title("'You Know' Per 10,000 Words\n", fontsize=22, linespacing=0.5)
# The president names on the tick labels are enough; drop the axis label.
ax.set_ylabel("")
mpl.pyplot.setp(ax.get_xticklabels(), fontsize=14)
mpl.pyplot.setp(ax.get_yticklabels(), fontsize=16)
# End on a statement so the cell does not echo the last call's return value.
pass

Time to Make Area Charts

In [12]:
def get_term_freq(df, term, resampler="AS"):
    """Return the use of `term` per 10,000 words for each resampling period.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a "datetime" column (used as the time index), a "word_count"
        column and a "passages" column whose values provide
        ``count_occurrences(term)``.
    term : str
        Expression handed to each passage's ``count_occurrences``.
    resampler : str, optional
        pandas offset alias that defines the periods. The default "AS" is
        year-start; recent pandas releases spell the same alias "YS".

    Returns
    -------
    pandas.Series
        Indexed by period, holding ``10000 * occurrences / words``. Periods
        without any words come out as NaN (0 / 0).
    """
    by_date = df.set_index("datetime")
    # resample(..., how="sum") was removed from pandas; aggregate on the
    # Resampler object instead.
    total_words = by_date["word_count"].resample(resampler).sum()
    term_counts = by_date["passages"].apply(
        lambda passage: passage.count_occurrences(term))
    freq_count = term_counts.resample(resampler).sum()
    return 10000 * freq_count / total_words
In [13]:
# Presidential passages whose transcript is attributed to Barack Obama.
is_obama = presidents_df["president"] == "Barack Obama"
obama_df = presidents_df[is_obama]
In [14]:
# Area chart of 'folks' per 10,000 words over time. The grey series is
# computed from every president's passages (presidents_df, Obama included);
# Obama's own series is then drawn over it in blue.
ax = get_term_freq(presidents_df, "folks").plot(
    label="Previous Presidents", color="#AAAAAA", alpha=1, kind="area")
# Pass ax explicitly so the overlay lands on the same axes instead of
# relying on pyplot's implicit "current axes".
get_term_freq(obama_df, "folks").plot(
    ax=ax, label="Obama", color=BLUE, alpha=1, kind="area")
ax.set_title("Presidential Use of 'Folks'", fontsize=20)
ax.set_xlabel("Year")
ax.set_ylabel("Use Per 10,000 Words")
ax.set_ylim(0,14)
ax.legend(loc="upper left")
# End on a statement so the cell does not echo the legend object.
pass
In [15]:
# Area chart of 'okay' per 10,000 words over time. The grey series is
# computed from every president's passages (presidents_df, Obama included);
# Obama's own series is then drawn over it in blue.
ax = get_term_freq(presidents_df, "okay").plot(
    label="Previous Presidents", color="#AAAAAA", alpha=1, kind="area")
# Pass ax explicitly so the overlay lands on the same axes instead of
# relying on pyplot's implicit "current axes".
get_term_freq(obama_df, "okay").plot(
    ax=ax, label="Obama", color=BLUE, alpha=1, kind="area")
ax.set_title("Presidential Use of 'Okay'", fontsize=20)
ax.set_xlabel("Year")
ax.set_ylabel("Use Per 10,000 Words")
ax.set_ylim(0,8)
ax.legend(loc="upper left")
# End on a statement so the cell does not echo the legend object.
pass
In [ ]: