from __future__ import division import numpy as np import pandas as pd import matplotlib as mpl import matplotlib.pyplot as plt pd.set_option('display.mpl_style', 'default') #IPython magic command for inline plotting %matplotlib inline #a better plot shape for IPython mpl.rcParams['figure.figsize']=[15,3] x = np.linspace(0, 1, 10001) y = np.cos(np.pi/x) * np.exp(-x**2) plt.plot(x, y) plt.show() x=np.linspace(-1, 2, 10001) y = x**2*np.exp(-x) plt.plot(x, y) plt.show() s = pd.Series([1,5,float('NaN'),7.5,2.1,3]) print(s) dates = pd.date_range('20140201', periods=s.size) s.index = dates print(s) letters = ['A', 'B', 'Ch', '#', '#', '---'] s.index = letters print(s) print('\nAccess is like a dictionary key:\ns[\'---\'] = '+str(s['---'])) print('\nRepeat labels are possible:\ns[\'#\']=\n'+str(s['#'])) t = np.exp(s) print(t) s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) s.str.upper() s.str.lower() s.str.len() s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) print s2 s2.str.split('_') broken_df = pd.read_csv('2012.csv') #Look at the first 4 rows broken_df[:4] fixed_df = pd.read_csv('2012.csv', index_col='Date') fixed_df[:3] s = pd.Series([1,5,float('NaN'),7.5,2.1,3]) df = pd.DataFrame(s, columns=['x']) print(df) t=np.exp(s) df['exp(x)'] = t df['exp(exp(x))'] = np.exp(t) print(df) print(df['x'], '\n') #column #letters = ['A', 'B', 'Ch', '#', '#', '---'] #df.index=letters #print(df.loc['#'], '\n') #row by label #print(df.iloc[3], '\n') #row by number (note the transposition in output!) 
# Pandas tutorial, part 2: selection, missing data, statistics, merging,
# grouping, a small OLS fit, time series and plotting helpers.
#
# Relies on names created earlier in the file: pd, np, plt, dates, df and
# fixed_df.
#
# Every print call below takes exactly one argument so the output is the same
# whether or not ``print_function`` is in effect.
#
# NOTE(review): DataFrame.sort, DataFrame.append, resample(how=...),
# pandas.tools.plotting and pandas.util.testing are kept exactly as the
# notebook wrote them; later pandas releases renamed or removed them, so
# confirm the target pandas version before upgrading.
print(df[1:4])  # row by slice

df1 = pd.DataFrame(np.random.randn(dates.size, 4), index=dates, columns=list('ABCD'))
print(df1)
df1.sort(columns=list('B'))

fixed_df['Berri 1']
fixed_df[fixed_df['Berri 1'] > 1000]

# --- Boolean selection and scalar assignment ------------------------------
from pandas.util.testing import rands
df = pd.DataFrame(np.random.randn(dates.size, 4), index=dates, columns=list('ABCD'))
print(df)
df[df.B > 0]
df[df > 0]

df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
print(df2)
df2[df2['E'].isin(['one'])]

df.at[dates[0], 'A'] = 0
print(df)
df.iat[0, 1] = 0
print(df)

from random import randint
# ``range`` works on Python 2 and 3; ``_`` avoids clobbering the module-level
# ``x`` through the Python 2 list-comprehension variable leak.
df = pd.DataFrame({'A': [randint(1, 9) for _ in range(10)],
                   'B': [randint(1, 9)*10 for _ in range(10)],
                   'C': [randint(1, 9)*100 for _ in range(10)]})
print(df)

fixed_df['Berri 1'].plot()

# --- Missing data ---------------------------------------------------------
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)

# Reindexing with an extra column 'E' introduces NaN values.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
print(df1)
df1.loc[dates[0]:dates[1], 'E'] = 1
print(df1)

df1.dropna(how='all')  # any
df1.fillna(value=15)
pd.isnull(df1)

df2 = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df2)
df2.loc[dates[0]:dates[2], 'B'] = float('NaN')
print(df2)
print(df1+df2)

df1['A'].sum()
df1.mean(1)
df2.cumsum()

# --- Gaussian numbers histogram -------------------------------------------
from numpy.random import normal
n = 1000
x = pd.Series(normal(size=n))
# print x
avg = x.mean()
std = x.std()
x_avg = pd.Series(np.ones(n) * avg)
x_stdl = pd.Series(np.ones(n)*(avg-std))
x_stdh = pd.Series(np.ones(n)*(avg+std))
df_gauss = pd.DataFrame({'A': x_stdl, 'B': x_stdh, 'x': x})
df_gauss.plot(style=['rx', 'rx', 'bx'])
plt.figure()
df_gauss['x'].diff().hist(color='g', bins=50)

# --- apply / lambda -------------------------------------------------------
df = pd.DataFrame(np.random.randn(5, 5), columns=list('ABCDE'))
print(df)
df.apply(lambda x: x.max() - x.min())


# What does lambda do?
def f(x):
    """Return ``x`` multiplied by two (named counterpart of ``g`` below)."""
    return x*2


# The same function written as a lambda, kept as a lambda on purpose because
# the comparison with ``f`` is the point of this example.
g = lambda x: x*2
print(g(3))

# --- Reading a CSV over HTTP, slicing and concatenating -------------------
from contextlib import closing
from pandas import read_csv
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

# ``closing`` releases the HTTP handle on both Python versions.
with closing(urlopen("http://econpy.pythonanywhere.com/ex/NFL_1979.csv")) as page:
    df = read_csv(page)
print(df[:3])

df1 = df[0:10]
print(df1)
A = df1[:3]
B = df1[3:7]
C = df1[7:10]
print('{} {} {}'.format(A, B, C))
parts = [A, B, C]
df2 = pd.concat(parts)
print(df2)

# --- merge / append -------------------------------------------------------
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
print(left)
print(right)
pd.merge(left, right, on='key')

df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
rowadd = df.iloc[3]
print('{} {}'.format(rowadd, df))
df.append(rowadd, ignore_index=True)

# --- groupby --------------------------------------------------------------
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                         'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three',
                         'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
print(df)
df.groupby('A').sum()
df.groupby(['A', 'B']).sum()

# --- Ordinary least squares with statsmodels ------------------------------
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
url = "http://vincentarelbundock.github.com/Rdatasets/csv/HistData/Guerry.csv"
df = pd.read_csv(url)
# print df
df = df[['Lottery', 'Literacy', 'Wealth', 'Region']].dropna()
df.head()

mod = sm.ols(formula='Lottery ~ Literacy ', data=df)
res = mod.fit()
print(res.summary())

# Draw the fitted line over the raw points.
intercept, slope = res.params
xtest = np.linspace(1, 100, 100)
ytest = intercept+slope*xtest
plt.plot(df['Literacy'], df['Lottery'], 'kx')
plt.plot(xtest, ytest, 'r')
plt.show()

# --- Comparing two small samples ------------------------------------------
town1_heights = pd.Series([5, 6, 7, 6, 7.1, 6, 4])
town2_heights = pd.Series([5.5, 6.5, 7, 6, 7.1, 6])
town1_mean = town1_heights.mean()
town2_mean = town2_heights.mean()
print('{} {}'.format("Town 1 avg. height", town1_mean))
print('{} {}'.format("Town 2 avg. height", town2_mean))
print('{} {}'.format("Effect size: ", abs(town1_mean - town2_mean)))

df = pd.DataFrame({'T1': town1_heights, 'T2': town2_heights})
b = df.boxplot()

from scipy import stats
print('{} {}'.format("Town 1 Shapiro-Wilks p-value",
                     stats.shapiro(town1_heights)[1]))
print('{} {}'.format(" T-Test p-value:",
                     stats.ttest_ind(town1_heights, town2_heights, equal_var=False)[1]))

# --- Time series ----------------------------------------------------------
rng = pd.date_range('1/1/2012', periods=100, freq='S')
print(rng)
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.plot()

ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()

# Bar plot
ts = pd.DataFrame(np.random.randn(1000, 5), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
# Positional row access, spelled with .iloc like the rest of the file.
print(ts.iloc[5])
ts.iloc[5].plot(kind='bar')
plt.axhline(0, color='k')

# --- Weather data ---------------------------------------------------------
url_template = "http://climate.weather.gc.ca/climateData/bulkdata_e.html?format=csv&stationID=5415&Year={year}&Month={month}&timeframe=1&submit=Download+Data"
url = url_template.format(month=3, year=2012)
weather_mar2012 = pd.read_csv(url, skiprows=16, index_col='Date/Time',
                              parse_dates=True, encoding='latin1')
weather_mar2012

# Replace the downloaded headers with plain-ASCII names.
weather_mar2012.columns = [
    u'Year', u'Month', u'Day', u'Time', u'Data Quality', u'Temp (C)',
    u'Temp Flag', u'Dew Point Temp (C)', u'Dew Point Temp Flag',
    u'Rel Hum (%)', u'Rel Hum Flag', u'Wind Dir (10s deg)', u'Wind Dir Flag',
    u'Wind Spd (km/h)', u'Wind Spd Flag', u'Visibility (km)', u'Visibility Flag',
    u'Stn Press (kPa)', u'Stn Press Flag', u'Hmdx', u'Hmdx Flag', u'Wind Chill',
    u'Wind Chill Flag', u'Weather']
weather_mar2012[u'Temp (C)'].plot(figsize=(15, 5))

# Drop every column that contains at least one missing value.
weather_mar2012 = weather_mar2012.dropna(axis=1, how='any')
weather_mar2012


## Pandas cookbook
def download_weather_month(year, month):
    """Download one month of weather data and return it as a DataFrame.

    The URL is built from the module-level ``url_template``.  The CSV is read
    with the first 16 rows skipped and ``'Date/Time'`` parsed as the index.
    Columns containing any missing value are dropped, the degree sign is
    removed from the column names, and the ``Year``, ``Day``, ``Month``,
    ``Time`` and ``Data Quality`` columns are discarded.

    Parameters
    ----------
    year : int
        Year substituted into the URL.
    month : int
        Month substituted into the URL.

    Returns
    -------
    pandas.DataFrame
        The cleaned data, indexed by ``'Date/Time'``.
    """
    # NOTE(review): a January request is sent for the *following* year.  This
    # is kept from the original; it looks like a workaround for the data
    # source, but confirm that it is intended.
    if month == 1:
        year += 1
    url = url_template.format(year=year, month=month)
    # Same encoding as the March 2012 download above: the headers contain a
    # degree sign that is not valid UTF-8.
    weather_data = pd.read_csv(url, skiprows=16, index_col='Date/Time',
                               parse_dates=True, encoding='latin1')
    weather_data = weather_data.dropna(axis=1)
    # Unicode literal so the replacement works on decoded column names under
    # both Python 2 and Python 3.
    weather_data.columns = [col.replace(u'\xb0', '') for col in weather_data.columns]
    weather_data = weather_data.drop(['Year', 'Day', 'Month', 'Time', 'Data Quality'], axis=1)
    return weather_data


data_by_month = [download_weather_month(2012, i) for i in range(1, 13)]

# Saving to a csv
weather_2012 = pd.concat(data_by_month)
weather_2012.to_csv('weather_2012.csv')

weather_description = weather_2012['Weather']
is_snowing = weather_description.str.contains('Snow')
is_snowing.plot()

# Monthly median temperature and monthly mean of the snow indicator.
weather_2012['Temp (C)'].resample('M', how=np.median).plot(kind='bar')
is_snowing.astype(float).resample('M', how=np.mean)
is_snowing.astype(float).resample('M', how=np.mean).plot(kind='bar')

# --- Plotting helpers -----------------------------------------------------
df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
print(df)
# df.plot(kind='bar')
# df.plot(kind='bar', stacked=True)
# df.plot(kind='barh', stacked=True)
# print pd.__version__

from pandas.tools.plotting import scatter_matrix
df = pd.DataFrame(np.random.randn(100, 4), columns=['a', 'b', 'c', 'd'])
scatter_matrix(df, figsize=(7, 7), diagonal='kde')

from pandas.tools.plotting import andrews_curves
with closing(urlopen("https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv")) as page:
    df = read_csv(page)
andrews_curves(df, 'Name')

from pandas.tools.plotting import parallel_coordinates
# parallel_coordinates(df,'Name')

from pandas.tools.plotting import lag_plot
data = pd.Series(0.1 * np.random.rand(1000)
                 + 0.9 * np.sin(np.linspace(-99 * np.pi, 99 * np.pi, num=1000)))
lag_plot(data)