Strongly based on http://www.kaggle.com/c/titanic-gettingStarted
# Core stack: pandas for data wrangling, NumPy for array helpers,
# and scikit-learn's random forest as the classifier.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
# Load the Kaggle Titanic training set (relative path; run from the repo root).
train = pd.read_csv('./Assets/train.csv')
train.head()  # notebook-style preview of the first five rows
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Gender | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 1 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 0 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35 | 1 | 0 | 113803 | 53.1000 | C123 | S | 0 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35 | 0 | 0 | 373450 | 8.0500 | NaN | S | 1 |
We now convert strings to integers for classifiers
train['Gender'] = train['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
We have two missing values in Embarked
train['Embarked'][train['Embarked'].isnull()]
61 NaN 829 NaN Name: Embarked, dtype: object
Replace all missing Embarked values with the mode of Embarked
train['Embarked'][ train['Embarked'].isnull() ] = train['Embarked'].dropna().mode().values
Change Embarked from a Character Value to an int
# Build a stable letter -> integer mapping from the sorted unique port codes.
Ports = list(enumerate(np.unique(train['Embarked'])))
print('Ports = ', Ports)
Ports_dict = {name: i for i, name in Ports}
print('Ports Dict = ', Ports_dict)
# Series.map accepts the dict directly — no lambda wrapper needed.
train['Embarked'] = train['Embarked'].map(Ports_dict).astype(int)
print(train['Embarked'].head())
Ports = [(0, 'C'), (1, 'Q'), (2, 'S')] Ports Dict = {'C': 0, 'S': 2, 'Q': 1} 0 2 1 0 2 2 3 2 4 2 Name: Embarked, dtype: int32
Turns out we have a lot of missing ages!
print(len(train['Age'][train['Age'].isnull()]))
177
Make all missing ages equal to the median age; there are many different ways to do this!
train.loc[train['Age'].isnull(), 'Age'] = train['Age'].dropna().median()
Drop all the columns we don't need
# Remove the columns the classifier cannot use (free text / already encoded / id).
unused_columns = ['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId']
train.drop(unused_columns, axis=1, inplace=True)
train.head()
Survived | Pclass | Age | SibSp | Parch | Fare | Embarked | Gender | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | 22 | 1 | 0 | 7.2500 | 2 | 1 |
1 | 1 | 1 | 38 | 1 | 0 | 71.2833 | 0 | 0 |
2 | 1 | 3 | 26 | 0 | 0 | 7.9250 | 2 | 0 |
3 | 1 | 1 | 35 | 1 | 0 | 53.1000 | 2 | 0 |
4 | 0 | 3 | 35 | 0 | 0 | 8.0500 | 2 | 1 |
Load in the test data and do the same things
# Apply the identical preprocessing pipeline to the test set.
test = pd.read_csv('./Assets/test.csv')
test['Gender'] = test['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
# .loc instead of chained indexing (which warns and fails under pandas
# copy-on-write); .iloc[0] takes the modal port as a scalar.
test.loc[test['Embarked'].isnull(), 'Embarked'] = test['Embarked'].dropna().mode().iloc[0]
# Reuse the train-set port mapping so the encodings agree across both frames.
test['Embarked'] = test['Embarked'].map(Ports_dict).astype(int)
test.loc[test.Age.isnull(), 'Age'] = test['Age'].dropna().median()
median_fare = np.zeros(3)
We have a missing fare that will will fill in with the fare of the respective class
print(len(test['Fare'][test['Fare'].isnull()]))
median_fare = {}
for i in test['Pclass'].unique():
median_fare[i] = test[test['Pclass'] == i+1 ]['Fare'].dropna().median()
for f in test['Pclass'].unique():
test.loc[ (test['Fare'].isnull()) & (test['Pclass'] == f+1 ), 'Fare'] = median_fare[f]
1
We don't want PassengerId to be in our classifier, so let's store it in another variable for now so we can drop the columns we don't need
# Keep PassengerId aside for the submission file, then drop the columns the
# classifier cannot use — mirroring what was done to the training frame.
passenger_ids = test['PassengerId'].values
test.drop(columns=['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], inplace=True)
We are now ready to make a Random Forest Classifier, first we load the data into a NumPy Array
# Materialise both frames as plain NumPy arrays for scikit-learn.
train_data = train.values
test_data = test.values
train.head()  # preview: column 0 is Survived, the rest are features
Survived | Pclass | Age | SibSp | Parch | Fare | Embarked | Gender | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | 22 | 1 | 0 | 7.2500 | 2 | 1 |
1 | 1 | 1 | 38 | 1 | 0 | 71.2833 | 0 | 0 |
2 | 1 | 3 | 26 | 0 | 0 | 7.9250 | 2 | 0 |
3 | 1 | 1 | 35 | 1 | 0 | 53.1000 | 2 | 0 |
4 | 0 | 3 | 35 | 0 | 0 | 8.0500 | 2 | 1 |
# Train a 100-tree random forest: column 0 of train_data is the Survived
# label, all remaining columns are features.
forest = RandomForestClassifier(n_estimators=100)
# Standard slice syntax ([:, 1:] / [:, 0]) instead of the obfuscated [0::,1::].
forest = forest.fit(X=train_data[:, 1:], y=train_data[:, 0])
Now we simply run the model we have trained against our test set
output = forest.predict(test_data).astype(int)
Create a DataFrame to store the results
# Assemble the submission frame. Kaggle requires the id column to be spelled
# exactly 'PassengerId' — the original 'PassangerId' would be rejected.
test_output = pd.DataFrame(data=passenger_ids, columns=['PassengerId'])
test_output['Survived'] = output
print(test_output.head())
PassangerId Survived 0 892 0 1 893 0 2 894 0 3 895 1 4 896 0
Write to csv
test_output.to_csv('./assets/myfirstforest.csv')
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23 — import joblib directly (it is a hard dependency of scikit-learn).
import joblib
# compress=9: maximum compression for the pickled model file.
joblib.dump(forest, './Assets/my_model.pkl', compress=9)
['./Assets/my_model.pkl']