import whtranscripts
import pandas as pd
from datetime import datetime
%matplotlib inline
import matplotlib as mpl
import mplstyle, mplstyle.styles.simple
# Accent color reused by the bar charts and the Obama series further down.
BLUE = "#0077EE"
# Apply the packaged "simple" style first, then hand mplstyle a dict of
# notebook-specific overrides in a second call.
# NOTE(review): presumably the second call layers on top of the first -
# confirm against mplstyle's behavior.
mplstyle.set(mplstyle.styles.simple)
mplstyle.set({
# Default figure size (presumably width, height in inches, as in matplotlib).
"figure.figsize": (10, 6),
"axes": {
# Two-color cycle for successive plotted series.
"color_cycle": [ "teal", "red" ],
},
"lines": {
"linewidth": 2,
# NOTE(review): markersize 0 presumably hides point markers on lines - confirm.
"markersize": 0
}
})
# Load every press-conference transcript found in the data directory.
conferences = whtranscripts.Conference.from_dir(
    "president_speech_notebooks/press_conference_data"
)

# Flatten the per-conference passage lists into one list, then build a frame
# with one row per passage object.
all_passages = [
    passage
    for conference in conferences
    for passage in conference.passages
]
passages_df = pd.DataFrame(all_passages, columns=["passages"])

# Derived columns, computed from the same passage objects in row order:
# the president attached to the passage's transcript, the passage's speaker
# label, and the transcript date rebuilt as a datetime (year, month, day only).
passages_df["president"] = [
    passage.transcript.president for passage in all_passages
]
passages_df["speaker"] = [passage.speaker for passage in all_passages]
passages_df["datetime"] = [
    datetime(
        passage.transcript.date.year,
        passage.transcript.date.month,
        passage.transcript.date.day,
    )
    for passage in all_passages
]
def is_president(row):
    """Return True when the row's speaker label refers to the president.

    Args:
        row: Mapping-like object (dict or DataFrame row) with a "speaker"
            entry (speaker label, may be None or empty) and a "president"
            entry (full name of the president for the transcript).

    Returns:
        bool: True if the speaker label matches one of the presidential
        patterns below, False otherwise.
    """
    speaker = row["speaker"]
    # A missing or empty speaker label can never match any pattern.
    if not speaker:
        return False

    # "The President", unless the label also mentions a Secretary.
    if "The President" in speaker and "Secretary" not in speaker:
        return True

    # Exact label variants, including the "The." form present in the data.
    if speaker in ("The. President", "Mr. President"):
        return True

    # Fall back to the president's surname (last whitespace-separated token).
    # An empty or missing name yields no surname, so nothing can match;
    # previously this case raised instead of returning False.
    name_parts = (row["president"] or "").split()
    if not name_parts:
        return False
    surname = name_parts[-1]

    # Labels containing "Mrs." or "Governor" are excluded even when they
    # contain the surname.
    return (
        surname in speaker
        and "Mrs." not in speaker
        and "Governor" not in speaker
    )
# Flag each passage whose speaker label refers to the president.
passages_df["is_president"] = passages_df.apply(is_president, axis=1)

# Keep only the president's passages. Take an explicit copy: the columns added
# below would otherwise be written onto a slice of passages_df, which pandas
# reports with SettingWithCopyWarning.
presidents_df = passages_df[passages_df["is_president"]].copy()

# Per-passage word count and occurrence counts for the phrases of interest.
presidents_df["word_count"] = presidents_df["passages"].apply(lambda p: p.get_word_count())
presidents_df["youknow"] = presidents_df["passages"].apply(lambda p: p.count_occurrences("you know"))
presidents_df["folks"] = presidents_df["passages"].apply(lambda p: p.count_occurrences("folks"))
presidents_df["okay"] = presidents_df["passages"].apply(lambda p: p.count_occurrences("okay"))
# Total the numeric columns per president. numeric_only=True skips the object
# columns (passage objects, speaker labels) explicitly instead of relying on
# pandas dropping them silently.
presidents_sum = presidents_df.groupby("president").sum(numeric_only=True)

# Occurrences per 10,000 spoken words, computed column-wise.
presidents_sum["folks_per_10000"] = 10000 * presidents_sum["folks"] / presidents_sum["word_count"]
presidents_sum["okay_per_10000"] = 10000 * presidents_sum["okay"] / presidents_sum["word_count"]
presidents_sum["youknow_per_10000"] = 10000 * presidents_sum["youknow"] / presidents_sum["word_count"]
# Horizontal bar chart of "okay" per 10,000 words, one bar per president,
# sorted ascending by rate. sort_values replaces the removed DataFrame.sort.
ax = presidents_sum.sort_values("okay_per_10000", ascending=True)["okay_per_10000"].plot(
    kind="barh", figsize=(10, 8), width=0.75, color=BLUE, alpha=0.75)
ax.yaxis.grid(False)
ax.set_title("'Okay' Per 10,000 Words\n", fontsize=22, linespacing=0.5)
ax.set_ylabel("")
# Size the tick labels through the axes itself, so this does not depend on
# matplotlib.pyplot having been imported as an attribute of `mpl`.
ax.tick_params(axis="x", labelsize=14)
ax.tick_params(axis="y", labelsize=16)
# Trailing no-op kept from the original (presumably to keep the notebook cell
# from echoing a value).
pass
# Horizontal bar chart of "you know" per 10,000 words, one bar per president,
# sorted ascending by rate. sort_values replaces the removed DataFrame.sort.
ax = presidents_sum.sort_values("youknow_per_10000", ascending=True)["youknow_per_10000"].plot(
    kind="barh", figsize=(10, 8), width=0.75, color=BLUE, alpha=0.75)
ax.yaxis.grid(False)
ax.set_title("'You Know' Per 10,000 Words\n", fontsize=22, linespacing=0.5)
ax.set_ylabel("")
# Size the tick labels through the axes itself, so this does not depend on
# matplotlib.pyplot having been imported as an attribute of `mpl`.
ax.tick_params(axis="x", labelsize=14)
ax.tick_params(axis="y", labelsize=16)
# Trailing no-op kept from the original (presumably to keep the notebook cell
# from echoing a value).
pass
def get_term_freq(df, term, resampler="AS"):
    """Return occurrences of ``term`` per 10,000 words for each resampled period.

    Args:
        df: DataFrame with a "datetime" column, a numeric "word_count" column
            and a "passages" column whose values expose
            ``count_occurrences(term)``.
        term: Text passed to each passage's ``count_occurrences``.
        resampler: Resampling rule handed to ``Series.resample``.
            NOTE(review): newer pandas releases prefer the "YS" alias for
            year-start frequency - confirm against the installed version.

    Returns:
        Series indexed by resampled period: 10000 * (summed occurrences) /
        (summed word count) for that period.
    """
    by_date = df.set_index("datetime")
    # Aggregate through the Resampler object; the `how=` keyword of
    # resample() is no longer available in current pandas.
    total_words = by_date["word_count"].resample(resampler).sum()
    term_counts = by_date["passages"].apply(lambda p: p.count_occurrences(term))
    freq_count = term_counts.resample(resampler).sum()
    return 10000 * freq_count / total_words
# Rate of "folks" per resampled period: the series over all presidential
# passages in grey, with the Obama-only series drawn over it in blue.
obama_df = presidents_df[presidents_df["president"] == "Barack Obama"]
ax = get_term_freq(presidents_df, "folks").plot(
    label="Previous Presidents", color="#AAAAAA", alpha=1, kind="area")
# Pass the axes explicitly so the overlay does not depend on pyplot's
# implicit "current axes" state.
get_term_freq(obama_df, "folks").plot(
    ax=ax, label="Obama", color=BLUE, alpha=1, kind="area")
ax.set_title("Presidential Use of 'Folks'", fontsize=20)
ax.set_xlabel("Year")
ax.set_ylabel("Use Per 10,000 Words")
ax.set_ylim(0, 14)
ax.legend(loc="upper left")
# Trailing no-op kept from the original (presumably to keep the notebook cell
# from echoing the legend object).
pass
# Rate of "okay" per resampled period: the series over all presidential
# passages in grey, with the Obama-only series drawn over it in blue.
ax = get_term_freq(presidents_df, "okay").plot(
    label="Previous Presidents", color="#AAAAAA", alpha=1, kind="area")
# Pass the axes explicitly so the overlay does not depend on pyplot's
# implicit "current axes" state.
get_term_freq(obama_df, "okay").plot(
    ax=ax, label="Obama", color=BLUE, alpha=1, kind="area")
ax.set_title("Presidential Use of 'Okay'", fontsize=20)
ax.set_xlabel("Year")
ax.set_ylabel("Use Per 10,000 Words")
ax.set_ylim(0, 8)
ax.legend(loc="upper left")
# Trailing no-op kept from the original (presumably to keep the notebook cell
# from echoing the legend object).
pass