# Notebook export: environment setup followed by a tour of pandas Series.
# Bare expressions (e.g. a lone `a`) are kept from the notebook cells; they
# display a value in IPython and are no-ops when run as a plain script.
from IPython import get_ipython
from IPython.display import Image, HTML

# The notebook used line magics (%load_ext, %load_style, %matplotlib), which
# are not valid Python syntax. Issue them through the shell object instead,
# and skip them when no IPython shell is active.
_shell = get_ipython()
if _shell is not None:
    _shell.run_line_magic('load_ext', 'load_style')
    _shell.run_line_magic('load_style', 'talk.css')

import numpy as np
import matplotlib.pyplot as plt

if _shell is not None:
    _shell.run_line_magic('matplotlib', 'inline')

HTML('')

import pandas as pd

# "line_width" was the old name of this option; current pandas calls it
# "display.width".
pd.set_option("display.width", 80)

# toggle the line below if one doesn't want DataFrames displayed as HTML tables
# pd.set_option("display.notebook_repr_html", False)
pd.set_option("display.notebook_repr_html", True)

HTML('')

# --- Series basics ---------------------------------------------------------
a = pd.Series(np.random.normal(0, 1, (10,)))
a

b = a
b = b.mean()  # rebinding `b` leaves `a` untouched
b
a

a.index
a.values

a = pd.Series(np.random.normal(0, 1, (10,)), index=np.arange(1, 11))
a.index

a = pd.Series(
    np.random.normal(0, 1, 5),
    index=['a', 'b', 'c', 'd', 'e'],
    name='my series',
)
a

# Same bar chart twice: once through pandas, once directly with matplotlib.
plot = a.plot(kind='bar', rot=0, color='.8', title=a.name, hatch='///',
              edgecolor='k')

f, ax = plt.subplots()
positions = np.arange(len(a))
bars = ax.bar(positions, a.values, color='w', edgecolor='k', align='center',
              hatch='///')
ax.set_xticks(positions)
ax.set_xlim(-0.5, len(a) - 0.5)
ax.set_xticklabels(a.index, fontsize=16)
ax.set_title(a.name, fontsize=16)

# --- Label-based indexing --------------------------------------------------
a['c']
a['a':'c']
# Note the difference with standard Python / NumPy positional, integer indexing
a['c':]

a.drop('d')
# Series.append was removed from pandas; pd.concat is the replacement.
pd.concat([a, pd.Series({'f': 5})])

# --- Arithmetic aligns on the index; unmatched labels give NaN -------------
s1 = pd.Series(np.arange(1.0, 4.0), index=['a', 'b', 'c'])
s2 = pd.Series(np.arange(1.0, 4.0), index=['b', 'c', 'd'])
s3 = s1 + s2
s3
s3.mean()

s4 = s3.dropna()
s4

s3.fillna(-999)
s3.fillna(s3.mean())

# --- Time-series index -----------------------------------------------------
a
a.index = pd.date_range(start='2014-1-1', periods=len(a))  # default 'period' is daily
a.head()
a.index

# a datetime index in pandas has its own type
a.index

# but you can convert it to a numpy array of python datetime objects if you want
py_datetimes = a.index.to_pydatetime()
py_datetimes

# resample daily time-series to 5 minutes 'period', using forward filling method
# (the `fill_method=` keyword was removed; chain `.ffill()` on the resampler)
a.resample('5min').ffill()
a

# the `shift` method makes it easy, e.g. ...
# ... to compare series with lead / lags
a.shift(periods=-1)

# and the `truncate` method allows easy selection of time-slices
a.truncate(after='2014-1-2')

# --- DataFrame basics ------------------------------------------------------
import string  # part of the standard library

# `string.lowercase` exists only on Python 2; `ascii_lowercase` works on both.
idx = list(string.ascii_lowercase[:10])
print(idx)

df = pd.DataFrame(np.arange(100).reshape(10, 10), columns=idx,
                  index=np.arange(1, 11))
df

# here I am creating a DataFrame from a dictionary
df = pd.DataFrame({
    'A': np.random.random(5),
    'B': np.random.random(5),
    'C': np.random.random(5),
})
print(df)
df

df['A']
# `.ix` was removed from pandas; `.loc` does the same label-based lookup here.
df.loc[3]
df.loc[3]['A':'B']
df.loc[3][['A', 'C']]
df

df['D'] = np.random.random(5)
df

df['E'] = 2.5
df

# Assigning a bare array whose length differs from the frame raises
# ValueError; report it instead of letting it stop the script.
try:
    df['F'] = np.random.random(4)
except ValueError as err:
    print(err)

# A Series is aligned on the index instead, so the unmatched row becomes NaN.
df['F'] = pd.Series(np.random.random(4))
df

df.apply(np.sqrt)  # or np.sqrt(df)
df.describe()
df.plot(ylim=[0, 3])

df[(df['A'] >= 0.5) & (df['B'] <= 0.5)]
df.plot(figsize=(10, 8), subplots=True, sharex=True, kind='bar', rot=0)

# --- Reading data ----------------------------------------------------------
# pd.read_
# NOTE(review): the line above was an unfinished `pd.read_` expression,
# presumably a tab-completion demo; evaluating it raises AttributeError, so it
# is kept as a comment only.

SOI = pd.read_csv('./data/NIWA_SOI.csv')
SOI.head()
SOI.tail()

SOI = pd.read_csv('./data/NIWA_SOI.csv', index_col=0)
SOI.head()
SOI.index

SOI = SOI.dropna()
SOI.head()
SOI.index

# Cast the index labels to integers. `Index.to_native_types` and `np.int`
# are both gone from current pandas / numpy.
SOI.index = SOI.index.astype(int)
SOI.index

SOIs = SOI.stack()
SOIs.head()
SOIs.index

from dateutil import parser

# Each entry of the stacked index is a (row label, column label) pair; build
# a "<row>-<column>-1" string from it and let dateutil parse that into a date.
dateindex = [
    parser.parse("-".join(map(str, [key[0], key[1], 1])))
    for key in SOIs.index
]
dateindex

SOIs.index = dateindex
SOIs.head()
SOIs.plot(figsize=(14, 8))

# --- Split / apply / combine -----------------------------------------------
Image(filename='images/split-apply-combine.png', width=800)

url = "ftp://ftp.cpc.ncep.noaa.gov/wd52dg/data/indices/ersst3b.nino.mth.81-10.ascii"
#!wget -P ./data ftp://ftp.cpc.ncep.noaa.gov/wd52dg/data/indices/ersst3b.nino.mth.81-10.ascii

# Raw string so the regex backslash is not treated as a string escape.
data = pd.read_table(url, sep=r'\s+')
data.tail()

nino = data[['YR', 'MON', 'NINO3.4']]
nino.head()

groups = nino.groupby('MON')

for month, group in groups:
    print(month)
    print(group.head())

climatology = groups.mean()
climatology['NINO3.4'].head(12)
climatology['NINO3.4'].plot(kind='bar', ylim=[26, 28], rot=0)


def zscore(x):
    """Return `x` standardized: mean removed, divided by its standard deviation."""
    return (x - x.mean()) / x.std()


transformed = groups.transform(zscore)
transformed['NINO3.4'].plot()
data['ANOM.3'].plot()

# --- teqc summary file -----------------------------------------------------
# the r'\s+' separator is a regular expression (raw string keeps the backslash)
data = pd.read_table('./data/teqc_SUM.dat', sep=r'\s+', header=None)
data.head()

column_names = [
    'path',
    'year1', 'm1', 'd1', 't1',
    'year2', 'm2', 'd2', 't2',
    'nh', 'nd',
    'c1', 'c2', 'c3', 'c4', 'c5', 'c6',
]
data = pd.read_table('./data/teqc_SUM.dat', header=None, sep=r'\s+',
                     names=column_names)
data.head()

from datetime import datetime

# note that we are using a list comprehension; 2000 is added to `year1`
# before it is passed to datetime()
data.index = [
    datetime(y, m, d)
    for y, m, d in zip(data.year1 + 2000, data.m1, data.d1)
]
data.head()

data.pop('path')  # we pop the 'path' variable; be careful, it operates in place
data.head()

# DataFrame.sort() was removed from pandas; with no arguments it sorted by
# index, which is what sort_index() does.
data2 = data.sort_index()
data2.head()

data['c3'].plot(rot=90)
plt.plot(data2.c3.values)

data.columns

# same as data2 = data[['c1','c2','c3','c4','c5','c6']] or data2 = data.iloc[:, 10:]
data2 = data[data.columns[-6:]]
data2.head()

data2.plot(subplots=True, sharex=True, color='b', title='teqc_SUM',
           figsize=(14, 13), rot=90)
plt.savefig('teqc_SUM.pdf')

# Monthly mean of c1, grouped by the `m1` column.
# Select 'c1' before averaging so that only that column is aggregated; taking
# the mean of every column first fails on recent pandas if any remaining
# column is non-numeric (possibly t1 / t2 -- TODO confirm).
f, ax = plt.subplots(1, 1, figsize=(7, 7))
data.groupby(data['m1'])['c1'].mean().plot(
    kind='bar', ylim=[24000, 26000], color='steelblue', alpha=.7, ax=ax)
ax.set_xticklabels(list('JFMAMJJASOND'), rotation=None, fontsize=15)

# Same plot, this time grouping by the month taken from the datetime index.
f, ax = plt.subplots(1, 1, figsize=(7, 7))
data.groupby(data.index.month)['c1'].mean().plot(
    kind='bar', ylim=[24000, 26000], color='steelblue', alpha=.7, ax=ax)
ax.set_xticklabels(list('JFMAMJJASOND'), rotation=None, fontsize=13)

# A plain loop rather than a list comprehension run for its side effects: the
# original reused the name `f`, which on Python 2 leaks out of the
# comprehension and overwrites the figure handle.
for tick_label in ax.yaxis.get_ticklabels():
    tick_label.set_fontsize(13)