#!/usr/bin/env python
# coding: utf-8

# In[1]:

from numpy.random import randn
import numpy as np
np.random.seed(123)
import os
import matplotlib.pyplot as plt
import pandas as pd
plt.rc("figure", figsize=(10, 6))
np.set_printoptions(precision=4)
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80

# In[2]:

path = "datasets/bitly_usagov/example.txt"

# In[3]:

import json
with open(path) as f:
    records = [json.loads(line) for line in f]

# In[4]:

# this raises KeyError: 'tz', since not every record has a time zone field
time_zones = [rec["tz"] for rec in records]

# In[5]:

time_zones = [rec["tz"] for rec in records if "tz" in rec]
time_zones[:10]

# In[6]:

def get_counts(sequence):
    counts = {}
    for x in sequence:
        if x in counts:
            counts[x] += 1
        else:
            counts[x] = 1
    return counts

# In[7]:

from collections import defaultdict

def get_counts2(sequence):
    counts = defaultdict(int)  # values will initialize to 0
    for x in sequence:
        counts[x] += 1
    return counts

# In[8]:

counts = get_counts(time_zones)
counts["America/New_York"]
len(time_zones)

# In[9]:

def top_counts(count_dict, n=10):
    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]

# In[10]:

top_counts(counts)

# In[11]:

from collections import Counter

counts = Counter(time_zones)
counts.most_common(10)
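# The hand-rolled counters and collections.Counter should agree exactly; a
# minimal cross-check (assuming time_zones is still in scope):
assert get_counts(time_zones) == get_counts2(time_zones) == Counter(time_zones)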
# In[12]:

frame = pd.DataFrame(records)

# In[13]:

frame.info()
frame["tz"].head()

# In[14]:

tz_counts = frame["tz"].value_counts()
tz_counts.head()

# In[15]:

clean_tz = frame["tz"].fillna("Missing")
clean_tz[clean_tz == ""] = "Unknown"
tz_counts = clean_tz.value_counts()
tz_counts.head()

# In[16]:

plt.figure(figsize=(10, 4))

# In[17]:

import seaborn as sns
subset = tz_counts.head()
sns.barplot(y=subset.index, x=subset.to_numpy())

# In[18]:

frame["a"][1]
frame["a"][50]
frame["a"][51][:50]  # long line

# In[19]:

results = pd.Series([x.split()[0] for x in frame["a"].dropna()])
results.head(5)
results.value_counts().head(8)

# In[20]:

cframe = frame[frame["a"].notna()].copy()

# In[21]:

cframe["os"] = np.where(cframe["a"].str.contains("Windows"),
                        "Windows", "Not Windows")
cframe["os"].head(5)

# In[22]:

by_tz_os = cframe.groupby(["tz", "os"])

# In[23]:

agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts.head()

# In[24]:

indexer = agg_counts.sum(axis="columns").argsort()
indexer.values[:10]

# In[25]:

count_subset = agg_counts.take(indexer[-10:])
count_subset

# In[26]:

agg_counts.sum(axis="columns").nlargest(10)

# In[27]:

plt.figure()

# In[28]:

count_subset = count_subset.stack()
count_subset.name = "total"
count_subset = count_subset.reset_index()
count_subset.head(10)
sns.barplot(x="total", y="tz", hue="os", data=count_subset)

# In[29]:

def norm_total(group):
    group["normed_total"] = group["total"] / group["total"].sum()
    return group

results = count_subset.groupby("tz").apply(norm_total)

# In[30]:

plt.figure()

# In[31]:

sns.barplot(x="normed_total", y="tz", hue="os", data=results)

# In[32]:

g = count_subset.groupby("tz")
results2 = count_subset["total"] / g["total"].transform("sum")

# In[33]:

# sep="::" is a multi-character separator, which the C parser does not
# support, hence engine="python"
unames = ["user_id", "gender", "age", "occupation", "zip"]
users = pd.read_table("datasets/movielens/users.dat", sep="::",
                      header=None, names=unames, engine="python")

rnames = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_table("datasets/movielens/ratings.dat", sep="::",
                        header=None, names=rnames, engine="python")

mnames = ["movie_id", "title", "genres"]
movies = pd.read_table("datasets/movielens/movies.dat", sep="::",
                       header=None, names=mnames, engine="python")

# In[34]:

users.head(5)
ratings.head(5)
movies.head(5)
ratings

# In[35]:

data = pd.merge(pd.merge(ratings, users), movies)
data
data.iloc[0]

# In[36]:

mean_ratings = data.pivot_table("rating", index="title",
                                columns="gender", aggfunc="mean")
mean_ratings.head(5)

# In[37]:

ratings_by_title = data.groupby("title").size()
ratings_by_title.head()
active_titles = ratings_by_title.index[ratings_by_title >= 250]
active_titles

# In[38]:

mean_ratings = mean_ratings.loc[active_titles]
mean_ratings

# In[39]:

mean_ratings = mean_ratings.rename(index={"Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)":
                                          "Seven Samurai (Shichinin no samurai) (1954)"})

# In[40]:

top_female_ratings = mean_ratings.sort_values("F", ascending=False)
top_female_ratings.head()

# In[41]:

mean_ratings["diff"] = mean_ratings["M"] - mean_ratings["F"]

# In[42]:

sorted_by_diff = mean_ratings.sort_values("diff")
sorted_by_diff.head()

# In[43]:

sorted_by_diff[::-1].head()

# In[44]:

rating_std_by_title = data.groupby("title")["rating"].std()
rating_std_by_title = rating_std_by_title.loc[active_titles]
rating_std_by_title.head()

# In[45]:

rating_std_by_title.sort_values(ascending=False)[:10]

# In[46]:

movies["genres"].head()
movies["genres"].head().str.split("|")
movies["genre"] = movies.pop("genres").str.split("|")
movies.head()

# In[47]:

movies_exploded = movies.explode("genre")
movies_exploded[:10]

# In[48]:

ratings_with_genre = pd.merge(pd.merge(movies_exploded, ratings), users)
ratings_with_genre.iloc[0]
genre_ratings = (ratings_with_genre.groupby(["genre", "age"])
                 ["rating"].mean()
                 .unstack("age"))
genre_ratings[:10]

# In[49]:

get_ipython().system('head -n 10 datasets/babynames/yob1880.txt')

# In[50]:

names1880 = pd.read_csv("datasets/babynames/yob1880.txt",
                        names=["name", "sex", "births"])
names1880

# In[51]:

names1880.groupby("sex")["births"].sum()

# In[52]:

pieces = []
for year in range(1880, 2011):
    path = f"datasets/babynames/yob{year}.txt"
    frame = pd.read_csv(path, names=["name", "sex", "births"])

    # Add a column for the year
    frame["year"] = year
    pieces.append(frame)

# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)

# In[53]:

names

# In[54]:

total_births = names.pivot_table("births", index="year",
                                 columns="sex", aggfunc="sum")
total_births.tail()
total_births.plot(title="Total births by sex and year")

# In[55]:

def add_prop(group):
    group["prop"] = group["births"] / group["births"].sum()
    return group

names = names.groupby(["year", "sex"], group_keys=False).apply(add_prop)

# In[56]:

names

# In[57]:

names.groupby(["year", "sex"])["prop"].sum()

# In[58]:

def get_top1000(group):
    return group.sort_values("births", ascending=False)[:1000]

grouped = names.groupby(["year", "sex"])
top1000 = grouped.apply(get_top1000)
top1000.head()

# In[59]:

top1000 = top1000.reset_index(drop=True)

# In[60]:

top1000.head()

# In[61]:

boys = top1000[top1000["sex"] == "M"]
girls = top1000[top1000["sex"] == "F"]

# In[62]:

total_births = top1000.pivot_table("births", index="year",
                                   columns="name", aggfunc="sum")

# In[63]:

total_births.info()
subset = total_births[["John", "Harry", "Mary", "Marilyn"]]
subset.plot(subplots=True, figsize=(12, 10),
            title="Number of births per year")

# In[64]:

plt.figure()

# In[65]:

table = top1000.pivot_table("prop", index="year",
                            columns="sex", aggfunc="sum")
table.plot(title="Sum of top1000.prop by year and sex",
           yticks=np.linspace(0, 1.2, 13))

# In[66]:

df = boys[boys["year"] == 2010]
df

# In[67]:

prop_cumsum = df["prop"].sort_values(ascending=False).cumsum()
prop_cumsum[:10]
prop_cumsum.searchsorted(0.5)
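# searchsorted returns the position at which 0.5 would be inserted into the
# sorted cumulative proportions, i.e. how many of the most popular names it
# takes to reach half of all births. A tiny illustration with made-up
# proportions (hypothetical numbers, not taken from the dataset):
demo = pd.Series([0.3, 0.55, 0.8, 1.0])
demo.searchsorted(0.5)  # -> 1, so 1 + 1 = 2 names account for the first half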
# In[68]:

df = boys[boys["year"] == 1900]
in1900 = df.sort_values("prop", ascending=False)["prop"].cumsum()
in1900.searchsorted(0.5) + 1

# In[69]:

def get_quantile_count(group, q=0.5):
    group = group.sort_values("prop", ascending=False)
    return group["prop"].cumsum().searchsorted(q) + 1

diversity = top1000.groupby(["year", "sex"]).apply(get_quantile_count)
diversity = diversity.unstack()

# In[70]:

fig = plt.figure()

# In[71]:

diversity.head()
diversity.plot(title="Number of popular names in top 50%")

# In[72]:

def get_last_letter(x):
    return x[-1]

last_letters = names["name"].map(get_last_letter)
last_letters.name = "last_letter"

table = names.pivot_table("births", index=last_letters,
                          columns=["sex", "year"], aggfunc="sum")

# In[73]:

subtable = table.reindex(columns=[1910, 1960, 2010], level="year")
subtable.head()

# In[74]:

subtable.sum()
letter_prop = subtable / subtable.sum()
letter_prop

# In[75]:

import matplotlib.pyplot as plt
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop["M"].plot(kind="bar", rot=0, ax=axes[0], title="Male")
letter_prop["F"].plot(kind="bar", rot=0, ax=axes[1], title="Female",
                      legend=False)

# In[76]:

plt.subplots_adjust(hspace=0.25)

# In[77]:

letter_prop = table / table.sum()

dny_ts = letter_prop.loc[["d", "n", "y"], "M"].T
dny_ts.head()

# In[78]:

plt.close("all")

# In[79]:

fig = plt.figure()

# In[80]:

dny_ts.plot()

# In[81]:

all_names = pd.Series(top1000["name"].unique())
lesley_like = all_names[all_names.str.contains("Lesl")]
lesley_like

# In[82]:

filtered = top1000[top1000["name"].isin(lesley_like)]
filtered.groupby("name")["births"].sum()

# In[83]:

table = filtered.pivot_table("births", index="year",
                             columns="sex", aggfunc="sum")
table = table.div(table.sum(axis="columns"), axis="index")
table.tail()

# In[84]:

fig = plt.figure()

# In[85]:

table.plot(style={"M": "k-", "F": "k--"})

# In[86]:

import json
with open("datasets/usda_food/database.json") as f:
    db = json.load(f)
len(db)

# In[87]:

db[0].keys()
db[0]["nutrients"][0]
nutrients = pd.DataFrame(db[0]["nutrients"])
nutrients.head(7)

# In[88]:

info_keys = ["description", "group", "id", "manufacturer"]
info = pd.DataFrame(db, columns=info_keys)
info.head()
info.info()

# In[89]:

info["group"].value_counts()[:10]

# In[90]:

nutrients = []

for rec in db:
    fnuts = pd.DataFrame(rec["nutrients"])
    fnuts["id"] = rec["id"]
    nutrients.append(fnuts)

nutrients = pd.concat(nutrients, ignore_index=True)

# In[91]:

nutrients

# In[92]:

nutrients.duplicated().sum()  # number of duplicates
nutrients = nutrients.drop_duplicates()

# In[93]:

col_mapping = {"description": "food",
               "group": "fgroup"}
info = info.rename(columns=col_mapping, copy=False)
info.info()

col_mapping = {"description": "nutrient",
               "group": "nutgroup"}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
nutrients

# In[94]:

ndata = pd.merge(nutrients, info, on="id")
ndata.info()
ndata.iloc[30000]

# In[95]:

fig = plt.figure()

# In[96]:

result = ndata.groupby(["nutrient", "fgroup"])["value"].quantile(0.5)
result["Zinc, Zn"].sort_values().plot(kind="barh")

# In[97]:

by_nutrient = ndata.groupby(["nutgroup", "nutrient"])

def get_maximum(x):
    return x.loc[x.value.idxmax()]

max_foods = by_nutrient.apply(get_maximum)[["value", "food"]]

# make the food a little smaller
max_foods["food"] = max_foods["food"].str[:50]

# In[98]:

max_foods.loc["Amino Acids"]["food"]

# In[99]:

fec = pd.read_csv("datasets/fec/P00000001-ALL.csv", low_memory=False)
fec.info()

# In[100]:

fec.iloc[123456]

# In[101]:

unique_cands = fec["cand_nm"].unique()
unique_cands
unique_cands[2]

# In[102]:

parties = {"Bachmann, Michelle": "Republican",
           "Cain, Herman": "Republican",
           "Gingrich, Newt": "Republican",
           "Huntsman, Jon": "Republican",
           "Johnson, Gary Earl": "Republican",
           "McCotter, Thaddeus G": "Republican",
           "Obama, Barack": "Democrat",
           "Paul, Ron": "Republican",
           "Pawlenty, Timothy": "Republican",
           "Perry, Rick": "Republican",
           "Roemer, Charles E. 'Buddy' III": "Republican",
           "Romney, Mitt": "Republican",
           "Santorum, Rick": "Republican"}
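# Any candidate name missing from the parties table would map to NaN below.
# A quick coverage check (a minimal sketch, reusing unique_cands from above;
# the set difference is expected to be empty):
set(unique_cands) - set(parties)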
# In[103]:

fec["cand_nm"][123456:123461]
fec["cand_nm"][123456:123461].map(parties)

# Add it as a column
fec["party"] = fec["cand_nm"].map(parties)
fec["party"].value_counts()

# In[104]:

(fec["contb_receipt_amt"] > 0).value_counts()

# In[105]:

fec = fec[fec["contb_receipt_amt"] > 0]

# In[106]:

fec_mrbo = fec[fec["cand_nm"].isin(["Obama, Barack", "Romney, Mitt"])]

# In[107]:

fec["contbr_occupation"].value_counts()[:10]

# In[108]:

occ_mapping = {
    "INFORMATION REQUESTED PER BEST EFFORTS": "NOT PROVIDED",
    "INFORMATION REQUESTED": "NOT PROVIDED",
    "INFORMATION REQUESTED (BEST EFFORTS)": "NOT PROVIDED",
    "C.E.O.": "CEO"
}

def get_occ(x):
    # If no mapping provided, return x
    return occ_mapping.get(x, x)

fec["contbr_occupation"] = fec["contbr_occupation"].map(get_occ)

# In[109]:

emp_mapping = {
    "INFORMATION REQUESTED PER BEST EFFORTS": "NOT PROVIDED",
    "INFORMATION REQUESTED": "NOT PROVIDED",
    "SELF": "SELF-EMPLOYED",
    "SELF EMPLOYED": "SELF-EMPLOYED",
}

def get_emp(x):
    # If no mapping provided, return x
    return emp_mapping.get(x, x)

fec["contbr_employer"] = fec["contbr_employer"].map(get_emp)

# In[110]:

by_occupation = fec.pivot_table("contb_receipt_amt",
                                index="contbr_occupation",
                                columns="party", aggfunc="sum")
over_2mm = by_occupation[by_occupation.sum(axis="columns") > 2000000]
over_2mm

# In[111]:

plt.figure()

# In[112]:

over_2mm.plot(kind="barh")

# In[113]:

def get_top_amounts(group, key, n=5):
    totals = group.groupby(key)["contb_receipt_amt"].sum()
    return totals.nlargest(n)

# In[114]:

grouped = fec_mrbo.groupby("cand_nm")
grouped.apply(get_top_amounts, "contbr_occupation", n=7)
grouped.apply(get_top_amounts, "contbr_employer", n=10)

# In[115]:

bins = np.array([0, 1, 10, 100, 1000, 10000,
                 100_000, 1_000_000, 10_000_000])
labels = pd.cut(fec_mrbo["contb_receipt_amt"], bins)
labels

# In[116]:

grouped = fec_mrbo.groupby(["cand_nm", labels])
grouped.size().unstack(level=0)

# In[117]:

plt.figure()

# In[118]:

bucket_sums = grouped["contb_receipt_amt"].sum().unstack(level=0)
normed_sums = bucket_sums.div(bucket_sums.sum(axis="columns"),
                              axis="index")
normed_sums
normed_sums[:-2].plot(kind="barh")

# In[119]:

grouped = fec_mrbo.groupby(["cand_nm", "contbr_st"])
totals = grouped["contb_receipt_amt"].sum().unstack(level=0).fillna(0)
totals = totals[totals.sum(axis="columns") > 100000]
totals.head(10)

# In[120]:

percent = totals.div(totals.sum(axis="columns"), axis="index")
percent.head(10)
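# As a closing sketch, the state-level shares computed above can be drawn
# directly. Restricting to the ten states with the largest totals is an
# assumption made here for readability, not something from the dataset:
top10_states = totals.sum(axis="columns").nlargest(10).index
percent.loc[top10_states].plot(kind="barh", stacked=True)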