"""Pronoun usage in presidential press-conference transcripts.

Loads transcripts with ``whtranscripts``, keeps the passages attributed to
the sitting president, counts personal pronouns per passage, aggregates the
counts per president and per year, plots singular vs. plural first-person
usage over time, and writes the yearly series to ``singularVsPlural2.csv``.
"""
import datetime
import re
import sys

import pandas as pd

import mplstyle
import mplstyle.styles.simple
import whtranscripts

# Pronoun group -> tokens counted for that group.  Each token becomes its own
# column and each group column is the row-wise sum of its tokens.
# NOTE(review): the third-person group has no "them" (nor he/she forms); that
# matches the original token list -- confirm whether it is intentional.
PRONOUN_GROUPS = {
    "first_person_singular": ["i", "me", "my", "mine", "myself"],
    "first_person_plural": ["we", "our", "ours", "ourselves", "us"],
    "second_person": ["you", "your", "yours", "yourself"],
    "third_person": ["they", "their", "theirs", "themselves"],
}

conferences = whtranscripts.Conference.from_dir(
    "../president_speech_notebooks/press_conference_data"
)
all_passages = [
    passage for conference in conferences for passage in conference.passages
]

# One row per passage; the passage object itself is kept so its methods can
# be called later when counting tokens.
passages = pd.DataFrame(all_passages, columns=["passage"])
passages["date"] = passages["passage"].apply(lambda p: p.transcript.date)
passages["speaker"] = passages["passage"].apply(lambda p: p.speaker)
passages["text"] = passages["passage"].apply(lambda p: p.text)
passages["president"] = passages["passage"].apply(
    lambda p: p.transcript.president
)
passages["tokens"] = passages["passage"].apply(lambda p: p.tokens)


def is_president(row):
    """Return True when the row's speaker label denotes the president.

    A row matches when its ``speaker`` label

    * contains "The President" but not "Secretary", or
    * is exactly "The. President" or "Mr. President", or
    * contains the last word of ``row["president"]`` and neither "Mrs." nor
      "Governor".

    Rows with an empty or missing speaker never match.
    """
    speaker = row["speaker"]
    if not speaker:
        return False
    if "The President" in speaker and "Secretary" not in speaker:
        return True
    if speaker in ("The. President", "Mr. President"):
        return True
    surname = row["president"].split()[-1]
    return (
        surname in speaker
        and "Mrs." not in speaker
        and "Governor" not in speaker
    )


passages["is_president"] = passages.apply(is_president, axis=1)

# Copy the filtered rows so the column assignments below write to an
# independent frame rather than to a slice of ``passages``.
president_passages = passages[passages["is_president"]].copy()

for group, words in PRONOUN_GROUPS.items():
    for word in words:
        # ``w=word`` binds the current token; a bare closure would see only
        # the loop's final value if evaluation were ever deferred.
        president_passages[word] = president_passages["passage"].apply(
            lambda p, w=word: p.count_token_occurrences(w)
        )
    president_passages[group] = president_passages[words].sum(axis=1)

# Total first person = singular + plural (the original added the singular
# count to itself, leaving the plural count out of the total).
president_passages["first_person"] = (
    president_passages["first_person_singular"]
    + president_passages["first_person_plural"]
)
president_passages["word_count"] = president_passages["passage"].apply(
    lambda p: p.get_word_count()
)

COUNT_COLUMNS = [
    "word_count",
    "first_person",
    "first_person_singular",
    "first_person_plural",
    "second_person",
    "third_person",
]

president_analysis = president_passages[
    [
        "word_count",
        "tokens",
        "date",
        "speaker",
        "president",
        "passage",
        "first_person",
        "first_person_singular",
        "first_person_plural",
        "second_person",
        "third_person",
    ]
].copy()

# Sum only the numeric count columns; the remaining columns hold objects
# (token lists, dates, passage instances) that are not meaningful to add.
presidents = (
    president_analysis[["president"] + COUNT_COLUMNS]
    .groupby("president")
    .sum()
)

# Bare expressions below are notebook-style displays; they have no effect
# when the file is run as a plain script.
round(
    100.0
    * presidents["first_person_singular"].sum()
    / presidents["word_count"].sum()
)

for pct_column, count_column in [
    ("pct_first", "first_person"),
    ("pct_first_singular", "first_person_singular"),
    ("pct_first_plural", "first_person_plural"),
]:
    presidents[pct_column] = (
        100.0 * presidents[count_column] / presidents["word_count"]
    ).round(2)

SUMMARY_COLUMNS = [
    "pct_first_singular",
    "pct_first_plural",
    "pct_first",
    "word_count",
]
presidents[SUMMARY_COLUMNS].sort_values("pct_first_singular", ascending=False)
presidents[SUMMARY_COLUMNS].sort_values("pct_first_plural", ascending=False)
presidents[SUMMARY_COLUMNS].sort_values("pct_first", ascending=False)

# Equivalent of the notebook's ``%matplotlib inline`` magic; skipped when the
# file runs outside IPython, where ``get_ipython`` is not defined.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass

mplstyle.set(mplstyle.styles.simple)
mplstyle.set({
    "figure.figsize": (10, 6),
    "axes": {
        "color_cycle": [
            "teal",
            "red"
        ],
    },
    "lines": {
        "linewidth": 2
    }
})

# Resampling needs a datetime-like index, so build one from each row's
# year/month/day.
president_analysis["datetime"] = president_analysis["date"].apply(
    lambda d: datetime.datetime(d.year, d.month, d.day)
)


def get_term_freq(df, term, resampler="AS"):
    """Return ``term`` occurrences per 100 words for each resampled period.

    Args:
        df: Frame with a ``datetime`` column, a ``word_count`` column and a
            ``term`` count column.
        term: Name of the count column to measure.
        resampler: pandas offset alias for the period.  The default "AS" is
            the year-start alias (NOTE(review): newer pandas releases spell
            this "YS" -- confirm against the installed version).

    Returns:
        Series indexed by period start holding
        ``100 * sum(term) / sum(word_count)``.
    """
    indexed = df.set_index("datetime")
    total_words = indexed["word_count"].resample(resampler).sum()
    freq_count = indexed[term].resample(resampler).sum()
    return 100.0 * freq_count / total_words


ax = get_term_freq(president_analysis, "first_person_singular").plot(
    kind="line", label="singular", color="r"
)
# Draw on the same axes explicitly instead of relying on the current-axes
# default.
get_term_freq(president_analysis, "first_person_plural").plot(
    kind="line", label="plural", color="b", ax=ax
)
ax.legend(bbox_to_anchor=(0.2, 1))

terms_df = pd.DataFrame(
    get_term_freq(president_analysis, "first_person_singular")
)
terms_df.columns = ["singular"]
terms_df["plural"] = get_term_freq(president_analysis, "first_person_plural")
terms_df["date"] = terms_df.index
terms_df["year"] = terms_df["date"].apply(lambda d: int(d.year))
# ``set_index`` returns a new frame; the original discarded it, so the CSV
# was indexed by timestamp instead of by year.
terms_df = terms_df.set_index("year")
terms_df[["singular", "plural"]].to_csv("singularVsPlural2.csv")