# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplt
from IPython import get_ipython
from IPython.display import Image

# Seaborn is deliberately NOT imported yet, so the later cells can show
# how importing it changes the look of the plots.
# import seaborn as sns

# Notebook-only setup. `%matplotlib inline` and `!python --version` are
# IPython syntax, not Python; the equivalent API calls are used instead so
# this file is also valid when parsed or run as a plain script.
_ipython = get_ipython()  # None when not running under IPython
if _ipython is not None:
    # Plot in ipython notebook
    _ipython.run_line_magic('matplotlib', 'inline')
    # Nothing less than the latest :-)
    _ipython.system('python --version')

print('Numpy Version: ', np.__version__)
print('Pandas Version: ', pd.__version__)

# Set Display Options
pd.options.display.max_rows = 15

# Reading Data - Titanic Dataset
# read_csv, read_frame
train_df = pd.read_csv('kaggle_titanic_data/train.csv')
train_df

# What is a Pandas DataFrame
Image(filename='images/dataframe.png')

train_df.columns

# Describe summary of data
train_df.describe()

# Display Column Types
train_df.dtypes

# Change columns to lowercase
train_df.columns = [col.lower() for col in train_df.columns]
train_df.columns

# Rename sex to gender
train_df.rename(columns={'sex': 'gender'}, inplace=True)

# Select columns
train_df[['gender', 'pclass']]

# Select rows: index labels 0 through 6 (label slices with .loc include the
# end label), restricted to two columns. `.ix` has been removed from pandas,
# and a slice cannot appear inside a list literal, so the original
# expression could not even be parsed.
train_df.loc[0:6, ['name', 'survived']]

# Show only the passengers that survived
train_df[train_df.survived == 1]

# Unique values
train_df.gender.unique()

# How many Survived?
train_df['survived'].value_counts()

# Distribution of Age (default number of bins)
ax = train_df['age'].hist()
ax.set_title('Histogram of Age')  # the data plotted is age, not fare
ax.set_xlabel('x Label')

# Now switch to a nicer plot style (not necessary if you use seaborn).
# The old pandas option 'display.mpl_style' no longer exists; matplotlib
# style sheets are the replacement.
pyplt.style.use('ggplot')

# Distribution of Age, this time with an explicit bin count
ax = train_df['age'].hist(bins=30)
ax.set_title('Histogram of Age')
ax.set_xlabel('x Label')

# Importing seaborn changes the plot styling
import seaborn as sns

# Distribution of Age, after importing seaborn
ax = train_df['age'].hist(bins=30)
ax.set_title('Histogram of Age')
ax.set_xlabel('x Label')

# Rows whose age is missing
train_df.loc[train_df.age.isnull(), 'age']

# Age - fix null values
# Method 1: boolean mask + .loc. A single .loc assignment writes into
# train_df itself; the chained form `train_df.age[mask] = ...` may write
# into a temporary copy instead.
train_df.loc[train_df.age.isnull(), 'age'] = train_df.age.mean()

# Method 2 - Pandas Convenience Functions.
# fillna returns a new Series, so the result has to be assigned back.
train_df['age'] = train_df.age.fillna(train_df.age.mean())

# Did your gender make a difference in survival
train_df.gender.value_counts()

train_df.groupby(['pclass', 'gender'])['survived'].agg(['sum', 'count'])

Image(filename='images/wesm_book_groupby.png')

# Show how many survived by gender and class.
# Iterating a GroupBy yields (key, sub-frame) pairs; only the keys are printed.
class_gender_group = train_df.groupby(['pclass', 'gender'])
for k, group in class_gender_group:
    print(k)

# Plot how many survived, by gender and class
grid_plot = sns.FacetGrid(train_df, row='gender', col='pclass')
grid_plot.map(sns.regplot, 'survived', 'age', color='.3', fit_reg=False, x_jitter=.1)

# One box of ages per passenger class (x/y/data replaces the removed
# `groupby=` argument of early seaborn versions).
ax = sns.boxplot(x='pclass', y='age', data=train_df)
ax.set_title('Age Distribution by class')

# Who paid the highest Fare in Titanic. Did they survive?
# sort_index(by=...) has been removed from pandas; sort_values sorts by column.
train_df.sort_values(by='fare', ascending=False)


# Highest Paid ticket by Class.
def topn(group, field, n=5):
    """Return the ``n`` rows of ``group`` with the largest ``field`` values.

    Args:
        group: DataFrame to select from (e.g. one group of a groupby).
        field: Name of the column to rank by.
        n: Number of rows to keep.

    Returns:
        The first ``n`` rows of ``group`` after sorting by ``field`` in
        descending order.
    """
    return group.sort_values(by=field, ascending=False).head(n)


train_df.groupby('pclass').apply(topn, 'fare', 2)

# Youngest 2 by Class.
def botm(group, field, n=5):
    """Return the ``n`` rows of ``group`` with the smallest ``field`` values.

    Args:
        group: DataFrame to select from (e.g. one group of a groupby).
        field: Name of the column to rank by.
        n: Number of rows to keep.

    Returns:
        The first ``n`` rows of ``group`` after sorting by ``field`` in
        ascending order.
    """
    # sort_index(by=...) has been removed from pandas; sort_values sorts by column.
    return group.sort_values(by=field).head(n)


train_df.groupby('pclass').apply(botm, 'age', 2)

# Write back the changes
# train_df.to_csv('kaggle_titanic_data/train_modified.csv')


# Custom Translation of Values and creating new Columns
def gender_map(val):
    """Map a gender label to a number: 1 for ``'male'``, 0 for anything else."""
    if val == 'male':
        return 1
    return 0


train_df['gender_val'] = train_df.gender.map(gender_map)
train_df

train_df.drop('gender_val', axis=1, inplace=True)

# Iterating a DataFrame directly yields its column labels, not its rows.
for row in train_df:
    print(row)

# Some mistakes I made when working with Pandas
# Don't use Loop to update Dataframe.
# This loop is kept as the example of what NOT to do: it updates one cell
# per iteration. (`.ix` has been removed from pandas, so `.loc` is used.)
for k, row in train_df.iterrows():
    if row.gender == 'male':
        train_df.loc[k, 'gender'] = 1
    else:
        train_df.loc[k, 'gender'] = 0
train_df

# Filtering Gotchas
# Each comparison needs its own parentheses because `&` binds more tightly
# than `==`.
train_df[((train_df.survived == 1) & (train_df.pclass == 1))]