# import libaries
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplt
from IPython.display import Image
# Will first not import Seaborn to show how seaborn makes nice plots
# import seaborn as sns
# Plot in ipython notebook
%matplotlib inline
# Nothing less than the latest :-)
!python --version
Python 3.4.1
print('Numpy Version: ', np.__version__)
print('Pandas Version: ', pd.__version__)
Numpy Version: 1.8.1 Pandas Version: 0.14.0
# Set Display Options
pd.options.display.max_rows = 15
# Reading Data - Titanic Dataset
# read_csv, read_frame
train_df = pd.read_csv('kaggle_titanic_data/train.csv')
train_df
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26 | 0 | 0 | 111369 | 30.0000 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
# What is a Pandas DataFrame
Image(filename='images/dataframe.png')
train_df.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object')
# Describe summary of data
train_df.describe()
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
# Display Column Types
train_df.dtypes
PassengerId int64 Survived int64 Pclass int64 Name object Sex object Age float64 SibSp int64 Parch int64 Ticket object Fare float64 Cabin object Embarked object dtype: object
# Change columns to lowercase
train_df.columns = [col.lower() for col in train_df.columns]
train_df.columns
Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked'], dtype='object')
# Rename sex to gender
train_df.rename(columns={'sex': 'gender'}, inplace=True)
# select a Column
train_df[['gender', 'pclass']]
gender | pclass | |
---|---|---|
0 | male | 3 |
1 | female | 1 |
2 | female | 3 |
3 | female | 1 |
4 | male | 3 |
5 | male | 3 |
6 | male | 1 |
... | ... | ... |
884 | male | 3 |
885 | female | 3 |
886 | male | 2 |
887 | female | 1 |
888 | female | 3 |
889 | male | 1 |
890 | male | 3 |
891 rows × 2 columns
# select a row
train_df.ix[0:6][['name':'survived']]
File "<ipython-input-29-4751a92576db>", line 2 train_df.ix[0:6][['name':'survived']] ^ SyntaxError: invalid syntax
# show only that survived
train_df[train_df.survived == 1]
passengerid | survived | pclass | name | gender | age | sibsp | parch | ticket | fare | cabin | embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35 | 1 | 0 | 113803 | 53.1000 | C123 | S |
8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27 | 0 | 2 | 347742 | 11.1333 | NaN | S |
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14 | 1 | 0 | 237736 | 30.0708 | NaN | C |
10 | 11 | 1 | 3 | Sandstrom, Miss. Marguerite Rut | female | 4 | 1 | 1 | PP 9549 | 16.7000 | G6 | S |
11 | 12 | 1 | 1 | Bonnell, Miss. Elizabeth | female | 58 | 0 | 0 | 113783 | 26.5500 | C103 | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
871 | 872 | 1 | 1 | Beckwith, Mrs. Richard Leonard (Sallie Monypeny) | female | 47 | 1 | 1 | 11751 | 52.5542 | D35 | S |
874 | 875 | 1 | 2 | Abelson, Mrs. Samuel (Hannah Wizosky) | female | 28 | 1 | 0 | P/PP 3381 | 24.0000 | NaN | C |
875 | 876 | 1 | 3 | Najib, Miss. Adele Kiamie "Jane" | female | 15 | 0 | 0 | 2667 | 7.2250 | NaN | C |
879 | 880 | 1 | 1 | Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) | female | 56 | 0 | 1 | 11767 | 83.1583 | C50 | C |
880 | 881 | 1 | 2 | Shelley, Mrs. William (Imanita Parrish Hall) | female | 25 | 0 | 1 | 230433 | 26.0000 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19 | 0 | 0 | 112053 | 30.0000 | B42 | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26 | 0 | 0 | 111369 | 30.0000 | C148 | C |
342 rows × 12 columns
# unique values
train_df.gender.unique()
array(['male', 'female'], dtype=object)
# How many Survived?
train_df['survived'].value_counts()
0 549 1 342 dtype: int64
# Distribution of Age
ax = train_df['age'].hist() # bins
ax.set_title('Histogram of Fares')
ax.set_xlabel('x Label')
<matplotlib.text.Text at 0x10c74ebe0>
# Now set Pandas options to make plot pretty
pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier (not necessary if you use seaborn)
# Distribution of Age
ax = train_df['age'].hist(bins=30) # bins
ax.set_title('Histogram of Fares')
ax.set_xlabel('x Label')
<matplotlib.text.Text at 0x10c8e16d8>
import seaborn as sns
# Distribution of Age
ax = train_df['age'].hist(bins=30) # bins
ax.set_title('Histogram of Fares')
ax.set_xlabel('x Label')
<matplotlib.text.Text at 0x10e913198>
train_df.age[pd.isnull(train_df.age)]
5 NaN 17 NaN 19 NaN 26 NaN 28 NaN ... 849 NaN 859 NaN 863 NaN 868 NaN 878 NaN 888 NaN Name: age, Length: 177, dtype: float64
# Age - fix null values
# Method 1
train_df.age[pd.isnull(train_df.age)] = train_df.age.mean()
# Method 2 - Pandas Convenience Functions
train_df.age.fillna(train_df.age.mean())
0 22 1 38 2 26 3 35 4 35 ... 885 39.000000 886 27.000000 887 19.000000 888 29.699118 889 26.000000 890 32.000000 Name: age, Length: 891, dtype: float64
# Did your gender make a difference in survival
train_df.gender.value_counts()
male 577 female 314 dtype: int64
train_df.groupby(['pclass', 'gender'])['survived'].agg(['sum', 'count'])
sum | count | ||
---|---|---|---|
pclass | gender | ||
1 | female | 91 | 94 |
male | 45 | 122 | |
2 | female | 70 | 76 |
male | 17 | 108 | |
3 | female | 72 | 144 |
male | 47 | 347 |
Image(filename='images/wesm_book_groupby.png')
# Show how many survivied by gender and class
class_gender_group = train_df.groupby(['pclass', 'gender'])
for k, group in class_gender_group:
print(k)
(1, 'female') (1, 'male') (2, 'female') (2, 'male') (3, 'female') (3, 'male')
# Plot how many survived, by gender and class
grid_plot = sns.FacetGrid(train_df, row='gender', col='pclass')
grid_plot.map(sns.regplot, 'survived', 'age',color='.3', fit_reg=False, x_jitter=.1)
<seaborn.axisgrid.FacetGrid at 0x10e9d8240>
ax = sns.boxplot(train_df.age, groupby=train_df.pclass)
ax.set_title('Age Distribution by class')
<matplotlib.text.Text at 0x10f421208>
# Who paid the highest Fare in Titanic. Did they survive?
train_df.sort_index(by='fare', ascending=False)
passengerid | survived | pclass | name | gender | age | sibsp | parch | ticket | fare | cabin | embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
258 | 259 | 1 | 1 | Ward, Miss. Anna | female | 35 | 0 | 0 | PC 17755 | 512.3292 | NaN | C |
737 | 738 | 1 | 1 | Lesurer, Mr. Gustave J | male | 35 | 0 | 0 | PC 17755 | 512.3292 | B101 | C |
679 | 680 | 1 | 1 | Cardeza, Mr. Thomas Drake Martinez | male | 36 | 0 | 1 | PC 17755 | 512.3292 | B51 B53 B55 | C |
88 | 89 | 1 | 1 | Fortune, Miss. Mabel Helen | female | 23 | 3 | 2 | 19950 | 263.0000 | C23 C25 C27 | S |
27 | 28 | 0 | 1 | Fortune, Mr. Charles Alexander | male | 19 | 3 | 2 | 19950 | 263.0000 | C23 C25 C27 | S |
# Highest Paid ticket by Class.
def topn(group, field, n=5):
return group.sort_index(by=field, ascending=False)[:n]
train_df.groupby('pclass').apply(topn, 'fare', 2)
passengerid | survived | pclass | name | gender | age | sibsp | parch | ticket | fare | cabin | embarked | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
pclass | |||||||||||||
1 | 258 | 259 | 1 | 1 | Ward, Miss. Anna | female | 35.000000 | 0 | 0 | PC 17755 | 512.3292 | NaN | C |
737 | 738 | 1 | 1 | Lesurer, Mr. Gustave J | male | 35.000000 | 0 | 0 | PC 17755 | 512.3292 | B101 | C | |
2 | 665 | 666 | 0 | 2 | Hickman, Mr. Lewis | male | 32.000000 | 2 | 0 | S.O.C. 14879 | 73.5000 | NaN | S |
72 | 73 | 0 | 2 | Hood, Mr. Ambrose Jr | male | 21.000000 | 0 | 0 | S.O.C. 14879 | 73.5000 | NaN | S | |
3 | 324 | 325 | 0 | 3 | Sage, Mr. George John Jr | male | 29.699118 | 8 | 2 | CA. 2343 | 69.5500 | NaN | S |
792 | 793 | 0 | 3 | Sage, Miss. Stella Anna | female | 29.699118 | 8 | 2 | CA. 2343 | 69.5500 | NaN | S |
# Youngenst 2 by Class.
def botm(group, field, n=5):
return group.sort_index(by=field)[:n]
train_df.groupby('pclass').apply(botm, 'age', 2)
passengerid | survived | pclass | name | gender | age | sibsp | parch | ticket | fare | cabin | embarked | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
pclass | |||||||||||||
1 | 305 | 306 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.92 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S |
297 | 298 | 0 | 1 | Allison, Miss. Helen Loraine | female | 2.00 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | |
2 | 755 | 756 | 1 | 2 | Hamalainen, Master. Viljo | male | 0.67 | 1 | 1 | 250649 | 14.5000 | NaN | S |
78 | 79 | 1 | 2 | Caldwell, Master. Alden Gates | male | 0.83 | 0 | 2 | 248738 | 29.0000 | NaN | S | |
3 | 803 | 804 | 1 | 3 | Thomas, Master. Assad Alexander | male | 0.42 | 0 | 1 | 2625 | 8.5167 | NaN | C |
469 | 470 | 1 | 3 | Baclini, Miss. Helene Barbara | female | 0.75 | 2 | 1 | 2666 | 19.2583 | NaN | C |
# Write back the changes
# train_df.to_csv('kaggle_titanic_data/train_modified.csv')
# Custom Translation of Values and creating new Columns
def gender_map(val):
if val == 'male':
return 1
return 0
train_df['gender_val'] = train_df.gender.map(gender_map)
train_df
passengerid | survived | pclass | name | gender | age | sibsp | parch | ticket | fare | cabin | embarked | gender_val | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.000000 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 1 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.000000 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.000000 | 1 | 0 | 113803 | 53.1000 | C123 | S | 0 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.000000 | 0 | 0 | 373450 | 8.0500 | NaN | S | 1 |
5 | 6 | 0 | 3 | Moran, Mr. James | male | 29.699118 | 0 | 0 | 330877 | 8.4583 | NaN | Q | 1 |
6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.000000 | 0 | 0 | 17463 | 51.8625 | E46 | S | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
884 | 885 | 0 | 3 | Sutehall, Mr. Henry Jr | male | 25.000000 | 0 | 0 | SOTON/OQ 392076 | 7.0500 | NaN | S | 1 |
885 | 886 | 0 | 3 | Rice, Mrs. William (Margaret Norton) | female | 39.000000 | 0 | 5 | 382652 | 29.1250 | NaN | Q | 0 |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.000000 | 0 | 0 | 211536 | 13.0000 | NaN | S | 1 |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.000000 | 0 | 0 | 112053 | 30.0000 | B42 | S | 0 |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 29.699118 | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S | 0 |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.000000 | 0 | 0 | 111369 | 30.0000 | C148 | C | 1 |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.000000 | 0 | 0 | 370376 | 7.7500 | NaN | Q | 1 |
891 rows × 13 columns
train_df.drop('gender_val', axis=1, inplace=True)
for row in train_df:
print(row)
passengerid survived pclass name gender age sibsp parch ticket fare cabin embarked
# Some mistakes I made when working with Pandas
# Don't use Loop to update Dataframe
for k, row in train_df.iterrows():
if row.gender == 'male':
train_df.ix[k, 'gender'] = 1
else:
train_df.ix[k, 'gender'] = 0
train_df
# Filtering Gotchas
train_df[((train_df.survived == 1) & (train_df.pclass == 1))]
passengerid | survived | pclass | name | gender | age | sibsp | parch | ticket | fare | cabin | embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 38.000000 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 35.000000 | 1 | 0 | 113803 | 53.1000 | C123 | S |
11 | 12 | 1 | 1 | Bonnell, Miss. Elizabeth | 0 | 58.000000 | 0 | 0 | 113783 | 26.5500 | C103 | S |
23 | 24 | 1 | 1 | Sloper, Mr. William Thompson | 1 | 28.000000 | 0 | 0 | 113788 | 35.5000 | A6 | S |
31 | 32 | 1 | 1 | Spencer, Mrs. William Augustus (Marie Eugenie) | 0 | 29.699118 | 1 | 0 | PC 17569 | 146.5208 | B78 | C |
52 | 53 | 1 | 1 | Harper, Mrs. Henry Sleeper (Myna Haxtun) | 0 | 49.000000 | 1 | 0 | PC 17572 | 76.7292 | D33 | C |
55 | 56 | 1 | 1 | Woolner, Mr. Hugh | 1 | 29.699118 | 0 | 0 | 19947 | 35.5000 | C52 | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
856 | 857 | 1 | 1 | Wick, Mrs. George Dennick (Mary Hitchcock) | 0 | 45.000000 | 1 | 1 | 36928 | 164.8667 | NaN | S |
857 | 858 | 1 | 1 | Daly, Mr. Peter Denis | 1 | 51.000000 | 0 | 0 | 113055 | 26.5500 | E17 | S |
862 | 863 | 1 | 1 | Swift, Mrs. Frederick Joel (Margaret Welles Ba... | 0 | 48.000000 | 0 | 0 | 17466 | 25.9292 | D17 | S |
871 | 872 | 1 | 1 | Beckwith, Mrs. Richard Leonard (Sallie Monypeny) | 0 | 47.000000 | 1 | 1 | 11751 | 52.5542 | D35 | S |
879 | 880 | 1 | 1 | Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) | 0 | 56.000000 | 0 | 1 | 11767 | 83.1583 | C50 | C |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | 0 | 19.000000 | 0 | 0 | 112053 | 30.0000 | B42 | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | 1 | 26.000000 | 0 | 0 | 111369 | 30.0000 | C148 | C |
136 rows × 12 columns