from IPython.display import Image, HTML
%load_ext load_style
%load_style talk.css
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
HTML('')
import pandas as pd
# Wrap the console representation of pandas objects at 80 characters.
# The legacy "line_width" option no longer exists; "display.width" replaces it.
pd.set_option("display.width", 80)
# Toggle the two lines below if you do not want DataFrames displayed as HTML tables.
#pd.set_option("display.notebook_repr_html", False)
pd.set_option("display.notebook_repr_html", True)
HTML('')
# A Series built from 10 standard-normal draws; no index is given, so pandas
# supplies the default integer one.
a = pd.Series(np.random.normal(loc=0, scale=1, size=(10,)))
a
# ``b`` first names the same Series as ``a``; rebinding it to the mean only
# changes what ``b`` refers to, ``a`` is left as it was.
b = a
b = b.mean()
b
a
# The two components of a Series: its labels and its underlying values.
a.index
a.values
# Same kind of data, this time with an explicit integer index running 1..10.
a = pd.Series(np.random.normal(loc=0, scale=1, size=(10,)), index=np.arange(1, 11))
a.index
# Five draws labelled with letters; the Series also carries a name.
a = pd.Series(np.random.normal(loc=0, scale=1, size=5), index=list('abcde'), name='my series')
a
# Bar chart drawn straight from the Series: light-grey hatched bars with black
# edges, unrotated tick labels, and the Series name as the title.
plot = a.plot(kind='bar', rot=0, color='.8', title=a.name, hatch='///', edgecolor='k')
# A comparable chart assembled by hand with matplotlib (white hatched bars):
# the bars are placed at positions 0..len(a)-1 and centred on them.
f, ax = plt.subplots()
bars = ax.bar(np.arange(len(a)), a.values, color='w', edgecolor='k', align='center', hatch='///')
# One tick per bar, with half a unit of padding on each side of the x-axis.
ax.set_xticks(np.arange(len(a)))
ax.set_xlim(-0.5, len(a)-0.5)
# Label the ticks with the Series index and title the axes with its name.
ax.set_xticklabels(a.index, fontsize=16)
ax.set_title(a.name, fontsize=16)
# Select a single element by its label.
a['c']
# Label-based slices include BOTH endpoints.
a['a':'c'] ### Note the difference with standard Python / Numpy positional, integer indexing
a['c':]
# The result of drop is not assigned, so this only displays a Series without 'd'.
a.drop('d')
# Series.append has been removed from pandas; pd.concat is the supported way
# to extend a Series with the elements of another one.
pd.concat([a, pd.Series({'f': 5})])
# Two Series holding the values 1.0, 2.0, 3.0 under partially overlapping labels.
s1 = pd.Series([1.0, 2.0, 3.0], index=list('abc'))
s2 = pd.Series([1.0, 2.0, 3.0], index=list('bcd'))
# Addition aligns on the labels: labels present in only one operand give NaN.
s3 = s1.add(s2)
s3
# The mean skips the missing values.
s3.mean()
# Keep only the entries that are not missing.
s4 = s3[s3.notnull()]
s4
# Alternatively, replace the missing values by a constant or by the mean.
s3.fillna(value=-999)
s3.fillna(value=s3.mean())
a
# Swap the index for a date range starting on 2014-01-01, one entry per element.
a.index = pd.date_range(start='2014-1-1', periods=len(a)) # default frequency is daily
a.head()
a.index
### a datetime index in Pandas has its own type
a.index
### but you can convert it to a numpy array of python datetime objects if you want
py_datetimes = a.index.to_pydatetime()
py_datetimes
### resample the daily time-series to a 5-minute frequency, using forward filling
### (the ``fill_method`` keyword has been removed from ``resample``: the fill is
### now chained on the object that ``resample`` returns)
a.resample('5min').ffill()
a
### the ``shift`` method makes it easy e.g. to compare series with leads / lags
a.shift(periods=-1)
### and the ``truncate`` method allows easy selection of time-slices
a.truncate(after='2014-1-2')
import string # part of the standard library
# ``string.lowercase`` only exists on Python 2 (and depends on the locale);
# ``string.ascii_lowercase`` is available on both Python 2 and Python 3.
idx = list(string.ascii_lowercase[:10])
print(idx)
# 10 x 10 table holding 0..99, columns labelled 'a'..'j' and rows labelled 1..10.
df = pd.DataFrame(np.arange(100).reshape(10,10),columns=idx,index=np.arange(1,11))
df
### here I am creating a DataFrame from a dictionary: one column per key
df = pd.DataFrame({'A' : np.random.random(5), 'B' : np.random.random(5), 'C': np.random.random(5)})
# print() with a single argument behaves the same on Python 2 and Python 3
print(df)
df
# Select one column by name.
df['A']
# ``.ix`` has been removed from pandas; ``.loc`` is the label-based indexer.
# Row labelled 3, then columns 'A' through 'B' (inclusive), then columns 'A' and 'C'.
df.loc[3]
df.loc[3, 'A':'B']
df.loc[3, ['A','C']]
df
# Assigning an array with one value per row creates a new column.
df['D'] = np.random.random(5)
df
# A scalar is repeated for every row.
df['E'] = 2.5
df
# A bare array with 4 values cannot be assigned to a 5-row DataFrame: pandas
# raises ValueError. The error is caught and printed so that the demonstration
# does not stop the rest of the script.
try:
    df['F'] = np.random.random(4)
except ValueError as err:
    print(err)
# A Series, on the other hand, is aligned on the index: rows without a
# matching label are filled with NaN.
df['F'] = pd.Series(np.random.random(4)) #
df
# Apply a NumPy function to the DataFrame.
df.apply(np.sqrt) # or np.sqrt(df)
# Summary statistics for each column.
df.describe()
# Plot the columns with the y-axis limited to [0, 3]; the trailing semicolon
# keeps the notebook from echoing the returned object.
df.plot(ylim=[0,3]);
# Boolean selection: keep the rows where A >= 0.5 and B <= 0.5.
df[(df['A'] >= 0.5) & (df['B'] <= 0.5)]
# One bar-chart panel per column, all panels sharing the x-axis.
df.plot(figsize=(10,8), subplots=True, sharex=True, kind='bar', rot=0);
# In the notebook, typing ``pd.read_`` and pressing <TAB> lists the available
# readers. As plain code that incomplete attribute raises AttributeError, so
# the readers are listed explicitly instead.
[name for name in dir(pd) if name.startswith('read_')]
SOI = pd.read_csv('./data/NIWA_SOI.csv')
SOI.head()
SOI.tail()
# Read the file again, this time using its first column as the index.
SOI = pd.read_csv('./data/NIWA_SOI.csv', index_col=0)
SOI.head()
SOI.index
# Discard the rows that contain missing values.
SOI = SOI.dropna()
SOI.head()
SOI.index
# Convert the index labels to integers. ``np.int`` and
# ``Index.to_native_types()`` no longer exist in current NumPy / pandas;
# ``Index.astype(int)`` performs the conversion directly.
SOI.index = SOI.index.astype(int)
SOI.index
# Stack the columns into the rows: the result is one long Series whose index
# pairs each row label with a column label.
SOIs = SOI.stack()
SOIs.head()
SOIs.index
from dateutil import parser
# Build one datetime per entry by parsing "<row label>-<column label>-1"
# (presumably year-month-first day; verify against the CSV layout).
dateindex = [
    parser.parse("%s-%s-1" % (row_label, col_label))
    for row_label, col_label in SOIs.index
]
dateindex
# Replace the two-level index by the list of datetimes and plot the series.
SOIs.index = dateindex
SOIs.head()
SOIs.plot(figsize=(14, 8))
Image(filename='images/split-apply-combine.png', width=800)
url = "ftp://ftp.cpc.ncep.noaa.gov/wd52dg/data/indices/ersst3b.nino.mth.81-10.ascii"
#!wget -P ./data ftp://ftp.cpc.ncep.noaa.gov/wd52dg/data/indices/ersst3b.nino.mth.81-10.ascii
# The columns are separated by runs of whitespace. The separator is a regular
# expression, written as a raw string so that the backslash is not treated as
# an (invalid) string escape.
data = pd.read_table(url, sep=r'\s+')
data.tail()
# Keep only the three columns used below.
nino = data[['YR','MON','NINO3.4']]
nino.head()
# Split the table into one group per distinct value of the 'MON' column.
groups = nino.groupby('MON')
# Iterating over a groupby object yields (key, sub-DataFrame) pairs.
for month, group in groups:
    print(month)
    print(group.head())
# Average every column within each group, then look at NINO3.4.
climatology = groups.mean()
climatology['NINO3.4'].head(12)
climatology['NINO3.4'].plot(kind='bar',ylim=[26,28], rot=0)
def zscore(x):
    """Return *x* standardized: centred on its mean and scaled by its standard deviation.

    *x* is any object providing ``mean()`` and ``std()`` and supporting
    arithmetic with their results (e.g. a pandas Series / DataFrame or a
    NumPy array). The standard deviation is whatever ``x.std()`` returns
    for that type.
    """
    z = (x - x.mean()) / x.std()
    return z
# Apply zscore separately within each 'MON' group; transform() returns the
# results for all groups put back together.
transformed = groups.transform(zscore)
transformed['NINO3.4'].plot()
# Plot the 'ANOM.3' column of the downloaded table (presumably the provider's
# own anomaly series, for comparison -- verify against the file description).
data['ANOM.3'].plot()
# First pass: whitespace-separated file without a header row. The separator is
# written as a raw string so that the backslash reaches the regex engine as is.
data = pd.read_table('./data/teqc_SUM.dat', sep=r'\s+', header=None) ## the '\s+' is a regular expression
data.head()
# Second pass: same file, this time giving explicit column names.
data = pd.read_table('./data/teqc_SUM.dat', header=None, sep=r'\s+',
                     names=['path','year1','m1','d1','t1','year2','m2','d2','t2','nh','nd','c1','c2','c3','c4','c5','c6'])
data.head()
from datetime import datetime
# Build the index from the year1 / m1 / d1 columns; year1 is offset by 2000.
data.index = [datetime(y,m,d) for y,m,d in zip(data.year1+2000,data.m1,data.d1)] # note that we are using list comprehension
data.head()
data.pop('path') # we pop the 'path' variable, be careful it operates in place
data.head()
# DataFrame.sort() has been removed from pandas. Called without arguments it
# ordered the rows by index label, which is what sort_index() does.
data2 = data.sort_index()
data2.head()
# c3 against the (unsorted) index, then the sorted c3 values against position.
data['c3'].plot(rot=90)
plt.plot(data2.c3.values)
data.columns
# Keep the last six columns only.
data2 = data[data.columns[-6:]]
# same as data2 = data[['c1','c2','c3','c4','c5','c6']] or data2 = data.iloc[:, 10:]
data2.head()
# One panel per column, sharing the x-axis; the figure is then saved as a PDF.
data2.plot(subplots=True, sharex=True, color='b', title='teqc_SUM', figsize=(14,13), rot=90)
plt.savefig('teqc_SUM.pdf')
# Mean of c1 for each value of the 'm1' column, as a bar chart. The column is
# selected BEFORE averaging so that only c1 is aggregated.
f, ax = plt.subplots(1,1, figsize=(7,7))
data.groupby(data['m1'])['c1'].mean().plot(kind='bar', ylim=[24000,26000], color='steelblue',alpha=.7, ax=ax)
ax.set_xticklabels(list('JFMAMJJASOND'), rotation=None, fontsize=15);
# Same chart, grouping on the month of the datetime index instead.
f, ax = plt.subplots(1,1, figsize=(7,7))
data.groupby(data.index.month)['c1'].mean().plot(kind='bar',
    ylim=[24000,26000], color='steelblue',alpha=.7, ax=ax)
ax.set_xticklabels(list('JFMAMJJASOND'), rotation=None, fontsize=13);
# A plain loop is used for this side effect: a list comprehension would build
# a throwaway list and, with ``f`` as its variable, overwrite the figure
# handle ``f`` on Python 2.
for label in ax.yaxis.get_ticklabels():
    label.set_fontsize(13)