#!/usr/bin/env python
# coding: utf-8

# # Chapter 12: Data processing and analysis with `pandas`

# Robert Johansson
# 
# Source code listings for [Numerical Python - Scientific Computing and Data Science Applications with Numpy, SciPy and Matplotlib](https://www.apress.com/us/book/9781484242452) (ISBN 978-1-484242-45-2).

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt


# In[2]:


import numpy as np


# In[3]:


import pandas as pd


# In[4]:


# pd.set_option('display.mpl_style', 'default')


# In[5]:


import matplotlib as mpl
mpl.style.use('ggplot')


# In[6]:


import seaborn as sns


# ## Series object

# In[7]:


s = pd.Series([909976, 8615246, 2872086, 2273305])


# In[8]:


s


# In[9]:


type(s)


# In[10]:


s.dtype


# In[11]:


s.index


# In[12]:


s.values


# In[13]:


s.index = ["Stockholm", "London", "Rome", "Paris"]


# In[14]:


s.name = "Population"


# In[15]:


s


# In[16]:


s = pd.Series([909976, 8615246, 2872086, 2273305], 
              index=["Stockholm", "London", "Rome", "Paris"], name="Population")


# In[17]:


s["London"]


# In[18]:


s.Stockholm


# In[19]:


s[["Paris", "Rome"]]


# In[20]:


s.median(), s.mean(), s.std()


# In[21]:


s.min(), s.max()


# In[22]:


s.quantile(q=0.25), s.quantile(q=0.5), s.quantile(q=0.75)


# In[23]:


s.describe()


# In[24]:


fig, axes = plt.subplots(1, 4, figsize=(12, 3.5))

s.plot(ax=axes[0], kind='line', title="line")
s.plot(ax=axes[1], kind='bar', title="bar")
s.plot(ax=axes[2], kind='box', title="box")
s.plot(ax=axes[3], kind='pie', title="pie")

fig.tight_layout()
fig.savefig("ch12-series-plot.pdf")
fig.savefig("ch12-series-plot.png")


# ## DataFrame object

# In[25]:


df = pd.DataFrame([[909976, 8615246, 2872086, 2273305],
                   ["Sweden", "United kingdom", "Italy", "France"]])


# In[26]:


df


# In[27]:


df = pd.DataFrame([[909976, "Sweden"],
                   [8615246, "United kingdom"], 
                   [2872086, "Italy"],
                   [2273305, "France"]])


# In[28]:


df


# In[29]:


df.index = ["Stockholm", "London", "Rome", "Paris"]


# In[30]:


df.columns = ["Population", "State"]


# In[31]:


df


# In[32]:


df = pd.DataFrame([[909976, "Sweden"],
                   [8615246, "United kingdom"], 
                   [2872086, "Italy"],
                   [2273305, "France"]],
                  index=["Stockholm", "London", "Rome", "Paris"],
                  columns=["Population", "State"])


# In[33]:


df


# In[34]:


df = pd.DataFrame({"Population": [909976, 8615246, 2872086, 2273305],
                   "State": ["Sweden", "United kingdom", "Italy", "France"]},
                  index=["Stockholm", "London", "Rome", "Paris"])


# In[35]:


df


# In[36]:


df.index


# In[37]:


df.columns


# In[38]:


df.values


# In[39]:


df.Population


# In[40]:


df["Population"]


# In[41]:


type(df.Population)


# In[42]:


df.Population.Stockholm


# In[43]:


type(df.index)


# In[44]:


df.loc["Stockholm"]


# In[45]:


type(df.loc["Stockholm"])


# In[46]:


df.loc[["Paris", "Rome"]]


# In[47]:


df.loc[["Paris", "Rome"], "Population"]


# In[48]:


df.loc["Paris", "Population"]


# In[49]:


df.mean()


# In[50]:


df.info()


# In[51]:


df.dtypes


# In[52]:


df.head()


# In[53]:


get_ipython().system('head -n5 /home/rob/datasets/european_cities.csv')


# ## Larger dataset

# In[54]:


df_pop = pd.read_csv("european_cities.csv")


# In[55]:


df_pop.head()


# In[56]:


df_pop = pd.read_csv("european_cities.csv", delimiter=",", encoding="utf-8", header=0)


# In[57]:


df_pop.info()


# In[58]:


df_pop.head()


# In[59]:


df_pop["NumericPopulation"] = df_pop.Population.apply(lambda x: int(x.replace(",", "")))


# In[60]:


df_pop["State"].values[:3]


# In[61]:


df_pop["State"] = df_pop["State"].apply(lambda x: x.strip())


# In[62]:


df_pop.head()


# In[63]:


df_pop.dtypes


# In[64]:


df_pop2 = df_pop.set_index("City")


# In[65]:


df_pop2 = df_pop2.sort_index()


# In[66]:


df_pop2.head()


# In[67]:


df_pop2.head()


# In[68]:


df_pop3 = df_pop.set_index(["State", "City"]).sort_index(level=0)


# In[69]:


df_pop3.head(7)


# In[70]:


df_pop3.loc["Sweden"]


# In[71]:


df_pop3.loc[("Sweden", "Gothenburg")]


# In[72]:


df_pop.set_index("City").sort_values(["State", "NumericPopulation"], ascending=[False, True]).head()


# In[73]:


city_counts = df_pop.State.value_counts()


# In[74]:


city_counts.name = "# cities in top 105"


# In[75]:


df_pop3 = df_pop[["State", "City", "NumericPopulation"]].set_index(["State", "City"])


# In[76]:


df_pop4 = df_pop3.sum(level="State").sort_values("NumericPopulation", ascending=False)


# In[77]:


df_pop4.head()


# In[78]:


df_pop5 = (df_pop.drop("Rank", axis=1)
                 .groupby("State").sum()
                 .sort_values("NumericPopulation", ascending=False))


# In[79]:


df_pop5.head()


# In[80]:


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

city_counts.plot(kind='barh', ax=ax1)
ax1.set_xlabel("# cities in top 105")
df_pop5.NumericPopulation.plot(kind='barh', ax=ax2)
ax2.set_xlabel("Total pop. in top 105 cities")

fig.tight_layout()
fig.savefig("ch12-state-city-counts-sum.pdf")


# ## Time series

# ### Basics

# In[81]:


import datetime


# In[82]:


pd.date_range("2015-1-1", periods=31)


# In[83]:


pd.date_range(datetime.datetime(2015, 1, 1), periods=31)


# In[84]:


pd.date_range("2015-1-1 00:00", "2015-1-1 12:00", freq="H")


# In[85]:


ts1 = pd.Series(np.arange(31), index=pd.date_range("2015-1-1", periods=31))


# In[86]:


ts1.head()


# In[87]:


ts1["2015-1-3"]


# In[88]:


ts1.index[2]


# In[89]:


ts1.index[2].year, ts1.index[2].month, ts1.index[2].day


# In[90]:


ts1.index[2].nanosecond


# In[91]:


ts1.index[2].to_pydatetime()


# In[92]:


ts2 = pd.Series(np.random.rand(2), 
                index=[datetime.datetime(2015, 1, 1), datetime.datetime(2015, 2, 1)])


# In[93]:


ts2


# In[94]:


periods = pd.PeriodIndex([pd.Period('2015-01'), pd.Period('2015-02'), pd.Period('2015-03')])


# In[95]:


ts3 = pd.Series(np.random.rand(3), periods)


# In[96]:


ts3


# In[97]:


ts3.index


# In[98]:


ts2.to_period('M')


# In[99]:


pd.date_range("2015-1-1", periods=12, freq="M").to_period()


# ### Temperature time series example

# In[100]:


get_ipython().system('head -n 5 temperature_outdoor_2014.tsv')


# In[101]:


df1 = pd.read_csv('temperature_outdoor_2014.tsv', delimiter="\t", names=["time", "outdoor"])


# In[102]:


df2 = pd.read_csv('temperature_indoor_2014.tsv', delimiter="\t", names=["time", "indoor"])


# In[103]:


df1.head()


# In[104]:


df2.head()


# In[105]:


df1.time = (pd.to_datetime(df1.time.values, unit="s")
              .tz_localize('UTC').tz_convert('Europe/Stockholm'))


# In[106]:


df1 = df1.set_index("time")


# In[107]:


df2.time = (pd.to_datetime(df2.time.values, unit="s")
              .tz_localize('UTC').tz_convert('Europe/Stockholm'))


# In[108]:


df2 = df2.set_index("time")


# In[109]:


df1.head()


# In[110]:


df1.index[0]


# In[111]:


fig, ax = plt.subplots(1, 1, figsize=(12, 4))
df1.plot(ax=ax)
df2.plot(ax=ax)

fig.tight_layout()
fig.savefig("ch12-timeseries-temperature-2014.pdf")


# In[112]:


# select january data


# In[113]:


df1.info()


# In[114]:


df1_jan = df1[(df1.index > "2014-1-1") & (df1.index < "2014-2-1")]


# In[115]:


df1.index < "2014-2-1"


# In[116]:


df1_jan.info()


# In[117]:


df2_jan = df2["2014-1-1":"2014-1-31"]


# In[118]:


fig, ax = plt.subplots(1, 1, figsize=(12, 4))

df1_jan.plot(ax=ax)
df2_jan.plot(ax=ax)

fig.tight_layout()
fig.savefig("ch12-timeseries-selected-month.pdf")


# In[119]:


# group by month


# In[120]:


df1_month = df1.reset_index()


# In[121]:


df1_month["month"] = df1_month.time.apply(lambda x: x.month)


# In[122]:


df1_month.head()


# In[123]:


df1_month = df1_month.groupby("month").aggregate(np.mean)


# In[124]:


df2_month = df2.reset_index()


# In[125]:


df2_month["month"] = df2_month.time.apply(lambda x: x.month)


# In[126]:


df2_month = df2_month.groupby("month").aggregate(np.mean)


# In[127]:


df_month = df1_month.join(df2_month)


# In[128]:


df_month.head(3)


# In[129]:


df_month = pd.concat([df.to_period("M").groupby(level=0).mean() for df in [df1, df2]], axis=1)


# In[130]:


df_month.head(3)


# In[131]:


fig, axes = plt.subplots(1, 2, figsize=(12, 4))

df_month.plot(kind='bar', ax=axes[0])
df_month.plot(kind='box', ax=axes[1])

fig.tight_layout()
fig.savefig("ch12-grouped-by-month.pdf")


# In[132]:


df_month


# In[133]:


# resampling


# In[134]:


df1_hour = df1.resample("H").mean()


# In[135]:


df1_hour.columns = ["outdoor (hourly avg.)"]


# In[136]:


df1_day = df1.resample("D").mean()


# In[137]:


df1_day.columns = ["outdoor (daily avg.)"]


# In[138]:


df1_week = df1.resample("7D").mean()


# In[139]:


df1_week.columns = ["outdoor (weekly avg.)"]


# In[140]:


df1_month = df1.resample("M").mean()


# In[141]:


df1_month.columns = ["outdoor (monthly avg.)"]


# In[142]:


df_diff = (df1.resample("D").mean().outdoor - df2.resample("D").mean().indoor)


# In[143]:


fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 6))

df1_hour.plot(ax=ax1, alpha=0.25)
df1_day.plot(ax=ax1)
df1_week.plot(ax=ax1)
df1_month.plot(ax=ax1)

df_diff.plot(ax=ax2)
ax2.set_title("temperature difference between outdoor and indoor")

fig.tight_layout()
fig.savefig("ch12-timeseries-resampled.pdf")


# In[144]:


pd.concat([df1.resample("5min").mean().rename(columns={"outdoor": 'None'}),
           df1.resample("5min").ffill().rename(columns={"outdoor": 'ffill'}),
           df1.resample("5min").bfill().rename(columns={"outdoor": 'bfill'})], axis=1).head()


# ## Selected day

# In[145]:


df1_dec25 = df1[(df1.index < "2014-9-1") & (df1.index >= "2014-8-1")].resample("D")


# In[146]:


df1_dec25 = df1.loc["2014-12-25"]


# In[147]:


df1_dec25.head(5)


# In[148]:


df2_dec25 = df2.loc["2014-12-25"]


# In[149]:


df2_dec25.head(5)


# In[150]:


df1_dec25.describe().T


# In[151]:


fig, ax = plt.subplots(1, 1, figsize=(12, 4))

df1_dec25.plot(ax=ax)

fig.savefig("ch12-timeseries-selected-month.pdf")


# In[152]:


df1.index


# # Seaborn statistical visualization library

# In[153]:


sns.set(style="darkgrid")


# In[154]:


#sns.set(style="whitegrid")


# In[155]:


df1 = pd.read_csv('temperature_outdoor_2014.tsv', delimiter="\t", names=["time", "outdoor"])
df1.time = pd.to_datetime(df1.time.values, unit="s").tz_localize('UTC').tz_convert('Europe/Stockholm')

df1 = df1.set_index("time").resample("10min").mean()
df2 = pd.read_csv('temperature_indoor_2014.tsv', delimiter="\t", names=["time", "indoor"])
df2.time = pd.to_datetime(df2.time.values, unit="s").tz_localize('UTC').tz_convert('Europe/Stockholm')
df2 = df2.set_index("time").resample("10min").mean()
df_temp = pd.concat([df1, df2], axis=1)


# In[156]:


fig, ax = plt.subplots(1, 1, figsize=(8, 4))
df_temp.resample("D").mean().plot(y=["outdoor", "indoor"], ax=ax)
fig.tight_layout()
fig.savefig("ch12-seaborn-plot.pdf")


# In[157]:


#sns.kdeplot(df_temp["outdoor"].dropna().values, shade=True, cumulative=True);


# In[158]:


sns.distplot(df_temp.to_period("M")["outdoor"]["2014-04"].dropna().values, bins=50);
sns.distplot(df_temp.to_period("M")["indoor"]["2014-04"].dropna().values, bins=50);

plt.savefig("ch12-seaborn-distplot.pdf")


# In[159]:


with sns.axes_style("white"):
    sns.jointplot(df_temp.resample("H").mean()["outdoor"].values,
                  df_temp.resample("H").mean()["indoor"].values, kind="hex");
    
plt.savefig("ch12-seaborn-jointplot.pdf")


# In[160]:


sns.kdeplot(df_temp.resample("H").mean()["outdoor"].dropna().values,
            df_temp.resample("H").mean()["indoor"].dropna().values, shade=False);

plt.savefig("ch12-seaborn-kdeplot.pdf")


# In[161]:


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))

sns.boxplot(df_temp.dropna(), ax=ax1, palette="pastel")
sns.violinplot(df_temp.dropna(), ax=ax2, palette="pastel")

fig.tight_layout()
fig.savefig("ch12-seaborn-boxplot-violinplot.pdf")


# In[162]:


sns.violinplot(x=df_temp.dropna().index.month, y=df_temp.dropna().outdoor, color="skyblue");

plt.savefig("ch12-seaborn-violinplot.pdf")


# In[163]:


df_temp["month"] = df_temp.index.month
df_temp["hour"] = df_temp.index.hour


# In[164]:


df_temp.head()


# In[165]:


table = pd.pivot_table(df_temp, values='outdoor', index=['month'], columns=['hour'], aggfunc=np.mean)


# In[166]:


table


# In[167]:


fig, ax = plt.subplots(1, 1, figsize=(8, 4))
sns.heatmap(table, ax=ax);

fig.tight_layout()
fig.savefig("ch12-seaborn-heatmap.pdf")


# ## Versions

# In[168]:


get_ipython().run_line_magic('reload_ext', 'version_information')


# In[169]:


get_ipython().run_line_magic('version_information', 'numpy, matplotlib, pandas, seaborn')