#!/usr/bin/env python # coding: utf-8 # # Pandas # Pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language. # # Library documentation: http://pandas.pydata.org/ # ### General # In[1]: import numpy as np import pandas as pd import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') # In[2]: # create a series s = pd.Series([1,3,5,np.nan,6,8]) s # In[3]: # create a data frame dates = pd.date_range('20130101',periods=6) df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD')) df # In[4]: # another way to create a data frame df2 = pd.DataFrame( { 'A' : 1., 'B' : pd.Timestamp('20130102'), 'C' : pd.Series(1,index=list(range(4)),dtype='float32'), 'D' : np.array([3] * 4,dtype='int32'), 'E' : 'foo' }) df2 # In[5]: df2.dtypes # In[6]: df.head() # In[7]: df.index # In[8]: df.columns # In[9]: df.values # In[10]: # quick data summary df.describe() # In[11]: df.T # In[12]: # axis 0 is index, axis 1 is columns df.sort_index(axis=1, ascending=False) # In[13]: # can sort by values too df.sort(columns='B') # ### Selection # In[14]: # select a column (yields a series) df['A'] # In[15]: # column names also attached to the object df.A # In[16]: # slicing works df[0:3] # In[17]: df['20130102':'20130104'] # In[18]: # cross-section using a label df.loc[dates[0]] # In[19]: # getting a scalar value df.loc[dates[0], 'A'] # In[20]: # select via position df.iloc[3] # In[21]: df.iloc[3:5,0:2] # In[22]: # column slicing df.iloc[:,1:3] # In[23]: # get a value by index df.iloc[1,1] # In[24]: # boolean indexing df[df.A > 0] # In[25]: df[df > 0] # In[26]: # filtering df3 = df.copy() df3['E'] = ['one', 'one', 'two', 'three', 'four', 'three'] df3[df3['E'].isin(['two', 'four'])] # In[27]: # setting examples df.at[dates[0],'A'] = 0 df.iat[0,1] = 0 df.loc[:, 'D'] = np.array([5] * len(df)) df # In[28]: # dealing with missing data df4 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E']) df4.loc[dates[0]:dates[1],'E'] = 1 df4 # In[29]: # drop rows with missing data df4.dropna(how='any') # In[30]: # fill missing data df4.fillna(value=5) # In[31]: # boolean mask for nan values pd.isnull(df4) # ### Operations # In[32]: df.mean() # In[33]: # pivot the mean calculation df.mean(1) # In[34]: # aligning objects with different dimensions s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2) df.sub(s,axis='index') # In[35]: # applying functions df.apply(np.cumsum) # In[36]: df.apply(lambda x: x.max() - x.min()) # In[37]: # simple count aggregation s = pd.Series(np.random.randint(0,7,size=10)) s.value_counts() # ### Merging / Grouping / Shaping # In[38]: # concatenation df = pd.DataFrame(np.random.randn(10, 4)) pieces = [df[:3], df[3:7], df[7:]] pd.concat(pieces) # In[39]: # SQL-style join left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]}) right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]}) pd.merge(left, right, on='key') # In[40]: # append df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) s = df.iloc[3] df.append(s, ignore_index=True) # In[41]: df = pd.DataFrame( { 'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C' : np.random.randn(8), 'D' : np.random.randn(8) }) df # In[42]: # group by df.groupby('A').sum() # In[43]: # group by multiple columns df.groupby(['A','B']).sum() # In[44]: df = pd.DataFrame( { 'A' : ['one', 'one', 'two', 'three'] * 3, 'B' : ['A', 'B', 'C'] * 4, 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2, 'D' : np.random.randn(12), 'E' : np.random.randn(12)} ) df # In[45]: # pivot table pd.pivot_table(df, values='D', rows=['A', 'B'], columns=['C']) # ### Time Series # In[46]: # time period resampling rng = pd.date_range('1/1/2012', periods=100, freq='S') ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) ts.resample('5Min', how='sum') # In[47]: rng = pd.date_range('1/1/2012', periods=5, freq='M') ts = pd.Series(np.random.randn(len(rng)), index=rng) ts # In[48]: ps = ts.to_period() ps.to_timestamp() # ### Plotting # In[49]: # time series plot ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) ts = ts.cumsum() ts.plot() # In[50]: # plot with a data frame df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=['A', 'B', 'C', 'D']) df = df.cumsum() plt.figure(); df.plot(); plt.legend(loc='best') # ### Input / Output # In[51]: # write to a csv file df.to_csv('foo.csv', index=False) # In[52]: # read file back in path = r'C:\Users\John\Documents\IPython Notebooks\foo.csv' newDf = pd.read_csv(path) newDf.head() # In[53]: # remove the file import os os.remove(path) # In[54]: # can also do Excel df.to_excel('foo.xlsx', sheet_name='Sheet1') # In[55]: newDf2 = pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA']) newDf2.head() # In[56]: os.remove('foo.xlsx')