"""Chart how often presidents say "you know", "folks" and "okay" in press conferences.

Loads press-conference transcripts with ``whtranscripts``, keeps only the
passages spoken by the president, and plots per-10,000-word usage rates of a
few filler terms, both per president and over time.
"""

from datetime import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

import mplstyle
import mplstyle.styles.simple
import whtranscripts

# Notebook-only magic, written as a plain call so the file is valid Python.
# Kept ahead of mplstyle.set(), matching the original statement order.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass  # not running under IPython/Jupyter

BLUE = "#0077EE"
GREY = "#AAAAAA"
RATE_BASE = 10000  # all rates are expressed per this many words
COUNTED_TERMS = {"youknow": "you know", "folks": "folks", "okay": "okay"}

mplstyle.set(mplstyle.styles.simple)
mplstyle.set({
    "figure.figsize": (10, 6),
    "axes": {
        "color_cycle": ["teal", "red"],
    },
    "lines": {
        "linewidth": 2,
        "markersize": 0,
    },
})

conferences = whtranscripts.Conference.from_dir(
    "president_speech_notebooks/press_conference_data")
all_passages = [x for b in conferences for x in b.passages]

passages_df = pd.DataFrame(all_passages, columns=["passages"])
passages_df["president"] = passages_df["passages"].apply(
    lambda x: x.transcript.president)
passages_df["speaker"] = passages_df["passages"].apply(lambda x: x.speaker)
passages_df["datetime"] = passages_df["passages"].apply(
    lambda x: datetime(x.transcript.date.year,
                       x.transcript.date.month,
                       x.transcript.date.day))


def is_president(row):
    """Return True when the passage in ``row`` was spoken by the president.

    ``row`` must provide ``"speaker"`` (the passage's speaker label, possibly
    empty/None) and ``"president"`` (the president's full name).  A passage
    counts as presidential when the speaker label:

    * contains "The President" but not "Secretary", or
    * is exactly "The. President" or "Mr. President", or
    * contains the president's last name but neither "Mrs." nor "Governor".
    """
    speaker = row["speaker"]
    if not speaker:
        return False
    if "The President" in speaker and "Secretary" not in speaker:
        return True
    if speaker in ("The. President", "Mr. President"):
        return True
    surname = row["president"].split()[-1]
    return (surname in speaker
            and "Mrs." not in speaker
            and "Governor" not in speaker)


passages_df["is_president"] = passages_df.apply(is_president, axis=1)

# .copy() so the column assignments below write to an independent frame
# rather than to a slice of passages_df (avoids SettingWithCopyWarning).
presidents_df = passages_df[passages_df["is_president"]].copy()
presidents_df["word_count"] = presidents_df["passages"].apply(
    lambda x: x.get_word_count())
for _column, _term in COUNTED_TERMS.items():
    presidents_df[_column] = presidents_df["passages"].apply(
        lambda x, term=_term: x.count_occurrences(term))

# Sum only the numeric count columns; the passage/speaker/datetime columns
# are not meaningful to add up.
presidents_sum = pd.DataFrame(
    presidents_df.groupby("president")[["word_count"] + list(COUNTED_TERMS)].sum())
for _column in ("folks", "okay", "youknow"):
    presidents_sum[_column + "_per_10000"] = (
        RATE_BASE * presidents_sum[_column] / presidents_sum["word_count"])


def _plot_rate_bars(column, title):
    """Draw a horizontal bar chart of ``presidents_sum[column]``, smallest first."""
    # Own figure per chart so charts never pile up on a shared axes.
    _, ax = plt.subplots(figsize=(10, 8))
    presidents_sum.sort_values(column, ascending=True)[column].plot(
        kind="barh", ax=ax, width=0.75, color=BLUE, alpha=0.75)
    ax.yaxis.grid(False)
    ax.set_title(title, fontsize=22, linespacing=0.5)
    ax.set_ylabel("")
    plt.setp(ax.get_xticklabels(), fontsize=14)
    plt.setp(ax.get_yticklabels(), fontsize=16)
    return ax


ax = _plot_rate_bars("okay_per_10000", "'Okay' Per 10,000 Words\n")
ax = _plot_rate_bars("youknow_per_10000", "'You Know' Per 10,000 Words\n")


def get_term_freq(df, term, resampler="AS"):
    """Return the per-10,000-word rate of ``term`` for each resampling period.

    Args:
        df: frame with ``"datetime"``, ``"word_count"`` and ``"passages"``
            columns; each passage must offer ``count_occurrences(term)``.
        term: the word or phrase to count.
        resampler: pandas resampling rule applied to the datetime index.

    Returns:
        A Series indexed by period: 10000 * occurrences / total words.
    """
    indexed = df.set_index("datetime")
    total_words = indexed["word_count"].resample(resampler).sum()
    occurrences = indexed["passages"].apply(
        lambda x: x.count_occurrences(term)).resample(resampler).sum()
    return RATE_BASE * occurrences / total_words


obama_df = presidents_df[presidents_df["president"] == "Barack Obama"]


def _plot_term_trend(term, title, y_max):
    """Draw area charts of ``term`` usage over time: all passages, then Obama's on top."""
    _, ax = plt.subplots()
    get_term_freq(presidents_df, term).plot(
        ax=ax, label="Previous Presidents", color=GREY, alpha=1, kind="area")
    get_term_freq(obama_df, term).plot(
        ax=ax, label="Obama", color=BLUE, alpha=1, kind="area")
    ax.set_title(title, fontsize=20)
    ax.set_xlabel("Year")
    ax.set_ylabel("Use Per 10,000 Words")
    ax.set_ylim(0, y_max)
    ax.legend(loc="upper left")
    return ax


ax = _plot_term_trend("folks", "Presidential Use of 'Folks'", 14)
ax = _plot_term_trend("okay", "Presidential Use of 'Okay'", 8)