#!/usr/bin/env python
# coding: utf-8

# Data Mangling with pandas
# Mark Santcroos, Department of Human Genetics, Leiden University Medical Center
#
# Examples and ideas taken from the pandas "10 Minutes to pandas" guide:
# https://pandas.pydata.org/pandas-docs/stable/10min.html
#
# pandas provides fast, flexible and expressive data structures designed to
# make working with relational/labeled data easy and intuitive.  It suits
# tabular data (SQL tables, spreadsheets), time series, arbitrary labeled
# matrices and general observational/statistical data sets.
#
# Primary data structures:
#   - Series    (1-dimensional)
#   - DataFrame (2-dimensional; a superset of R's data.frame)
#
# pandas is built on top of NumPy and integrates well with the wider
# scientific Python ecosystem.

# -- Getting started ---------------------------------------------------------

import pandas as pd
import numpy as np

pd.__version__

# -- Object creation ---------------------------------------------------------

# A Series containing a missing value (np.nan marks missing data).
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

# A DatetimeIndex covering six consecutive days.
dates = pd.date_range('20170901', periods=6)
dates

# A 6x4 NumPy array of standard-normal random values.
random_values = np.random.randn(6, 4)
random_values

# Build a DataFrame from the array; note the column order A, B, D, C.
df = pd.DataFrame(random_values, columns=['A', 'B', 'D', 'C'])
df

# Use the dates as the row index.
df.set_index(dates, inplace=True)
df

# A DataFrame can also be created from a dict of series-like objects.
# Scalars are broadcast to the length of the shared index; each column keeps
# its own dtype.
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20170920'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["LUMC", "EMC", "LUMC", "EMC"]),
                    'F': 'researcher'})
df2

df2.dtypes

# -- Tab completion ----------------------------------------------------------
# Typing `df.` in an interactive session shows attributes and column names.

# -- Exploring data ----------------------------------------------------------

# Top (head) and bottom (tail) of the data.
df.head()
df.tail(2)

# Metadata: index, columns and the underlying values.
df.index
df.columns
df.values

# Basic statistics per column.
df.describe()

# Transposing the data.
df.T

# Sorting: by axis labels, and by the values of a column.
df.sort_index(axis=1)
df.sort_values(by='B')

# -- Data Selection ----------------------------------------------------------

# Label based ---------------------------------------------------------------

# Selecting a single column returns a Series.
df['A']

# Slicing rows works positionally ...
df[1:4]

# ... or by index label.
df['20170902':'20170904']

# Cross section using a label.
df.loc['2017-09-02']

# Multi-axis selection by label.
df.loc[:, ['A', 'B']]

# Label slicing on both dimensions.
df.loc['20170902':'20170904', ['B', 'C']]

# A single row label reduces the dimension of the returned object.
df.loc['20170902', ['A', 'B']]

# Scalar value.
df.loc['20170902', 'A']

# Position based ------------------------------------------------------------
# The semantics follow Python and NumPy slicing closely.

# Single row.
df.iloc[3]

# Both dimensions.
df.iloc[3:5, 2:4]

# Rows only, then columns only.
df.iloc[1:3, :]
df.iloc[:, 1:3]

# Scalar value.
df.iloc[1, 1]

# Boolean indexing ----------------------------------------------------------

# Use a single column's values to select rows.
df[df.A > 0]

# Select values from the whole frame where a boolean condition is met.
df[df < 0]

# Work on a fresh copy and add an extra column E.
df3 = df.copy()
df3['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df3

# Filter rows with isin().
df3[df3['E'].isin(['two', 'four'])]

# -- Modifying data ----------------------------------------------------------

s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20170902', periods=6))
s1

# Adding column F aligns s1 on the existing index of df.
df['F'] = s1
df

# Set a value by label.
df.at['20170902', 'A'] = 0
df

# Set a value at a two-dimensional (positional) location.
df.iat[0, 1] = 0
df

# Set a whole column from a NumPy array.
df.loc[:, 'D'] = np.array([5] * len(df))
df

# Setting with a matching rule: negate every positive entry.
df2 = df.copy()
df2[df2 > 0] = -df2
df2

# -- Missing data ------------------------------------------------------------
# pandas uses np.nan to represent missing data; it is excluded from
# computations by default.

# Reindex (copies) a subset of the data, add an empty column E, and set E to
# 1 for the first two rows.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

# Drop all rows that contain any unknown values.
df1.dropna(how='any')

# Replace NA with a value.
df1.fillna(value=42)

# Show the boolean mask of missing entries.
pd.isnull(df1)

# -- Operations --------------------------------------------------------------

# Basic (stat) operators: mean per column (same as axis=0), then per row.
df.mean()
df.mean(axis=1)

# Apply: run a custom function over all values.
def my_func(value):
    """Return the negated value."""
    return -value

df.apply(my_func)

# Histogramming: count how often each value occurs.
s = pd.Series(np.random.randint(0, 7, size=10))
s
s.value_counts()

# -- Concatenating data ------------------------------------------------------

# A 10x4 table of random numbers ...
df = pd.DataFrame(np.random.randn(10, 4))
df

# ... split into three row-based chunks ...
chunks = [df[:3], df[3:7], df[7:]]
chunks

# ... and glued back together.
pd.concat(chunks)

# -- Joining data ------------------------------------------------------------
# There are many ways to combine multiple dataframes.
# SQL-style merge on a key column: with duplicated keys the result is the
# cartesian product of the matching rows (4 rows here).
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
left
right
pd.merge(left, right, on='key')

# With unique keys the merge pairs rows one-to-one.
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
left
right
pd.merge(left, right, on='key')

# -- Appending rows ----------------------------------------------------------

# Create an 8x4 matrix ...
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df

# ... extract a row ...
s = df.iloc[3]
s

# ... and append it at the end.
# DataFrame.append() was removed in pandas 2.0.0; use pd.concat() instead.
df = pd.concat([df, pd.DataFrame([s])], ignore_index=True)
df

# -- Grouping ----------------------------------------------------------------
# Value based grouping in order to execute methods on the results.

df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
df

# Sum per group.  numeric_only=True restricts the aggregation to the numeric
# columns C and D: since pandas 2.0, sum() no longer silently drops the
# string column B but concatenates its values, which is not what this
# example intends.
df.groupby('A').sum(numeric_only=True)

# Grouping by multiple columns creates a hierarchical index; both keys are
# part of the index, so only C and D remain as (numeric) value columns.
df.groupby(['A', 'B']).sum()

# -- Reshaping using pivot_table ---------------------------------------------

# Create a flat table with duplicated entries.
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                   'B': ['X', 'Y', 'Z'] * 4,
                   'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D': np.random.randn(12),
                   'E': np.random.randn(12)})
df

# Pivot: A and B become the index, the values of C become columns, and the
# cells hold the values of D (column E is not used).
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

# -- Further information -----------------------------------------------------
# - https://pandas.pydata.org/pandas-docs/stable/index.html
# - https://stackoverflow.com with the pandas tag