# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplt
from IPython.display import Image

# Will first not import Seaborn to show how seaborn makes nice plots
# import seaborn as sns

# Plot in ipython notebook
%matplotlib inline

# Nothing less than the latest :-)
!python --version

print('Numpy Version: ', np.__version__)
print('Pandas Version: ', pd.__version__)

# Display options: cap the number of rows shown when a frame is printed
pd.set_option('display.max_rows', 15)

# Reading data - load the Titanic training set from CSV
train_df = pd.read_csv('kaggle_titanic_data/train.csv')

# Show the loaded frame (notebook cell output)
train_df

# What is a Pandas DataFrame
Image(filename='images/dataframe.png')

# Column labels as loaded
train_df.columns

# Summary of the data
train_df.describe()

# Type of each column
train_df.dtypes

# Normalise the column labels to lowercase
train_df.columns = list(map(str.lower, train_df.columns))

# Column labels after lowercasing
train_df.columns

# Rename the 'sex' column to 'gender'
train_df = train_df.rename(columns={'sex': 'gender'})

# Select columns by passing a list of labels
train_df[['gender', 'pclass']]

# Select rows by index label (0 through 6, both ends included) together with
# two columns. `.ix` has been removed from pandas, and `['name':'survived']`
# is not valid Python (a slice cannot appear inside a list literal), so use
# `.loc` with an explicit list of column labels.
train_df.loc[0:6, ['name', 'survived']]

# Show only the passengers that survived
train_df[train_df.survived == 1]

# Unique values
train_df.gender.unique()

# How many survived?
train_df['survived'].value_counts()

# Distribution of age (plain matplotlib defaults)
ax = train_df['age'].hist()
ax.set_title('Histogram of Age')
ax.set_xlabel('Age')

# Now make the plot prettier without seaborn. The pandas option
# 'display.mpl_style' has been removed; matplotlib's own style sheets are the
# replacement (not necessary if you use seaborn).
pyplt.style.use('ggplot')

# Distribution of age, with finer bins
ax = train_df['age'].hist(bins=30)
ax.set_title('Histogram of Age')
ax.set_xlabel('Age')

import seaborn as sns

# Importing seaborn no longer restyles matplotlib by itself; apply its default
# theme explicitly so the difference is visible in the next plot.
sns.set()

# Distribution of age, now with seaborn styling
ax = train_df['age'].hist(bins=30)
ax.set_title('Histogram of Age')
ax.set_xlabel('Age')

# Rows where age is missing
train_df.age[pd.isnull(train_df.age)]

# Age - fix null values
# Method 1 - boolean mask with a single .loc assignment. Assigning through
# `train_df.age[mask] = ...` is chained indexing, which may write to a
# temporary copy instead of the frame.
missing_age = train_df.age.isnull()
train_df.loc[missing_age, 'age'] = train_df.age.mean()

# Method 2 - Pandas convenience function. fillna returns a new Series, so the
# result has to be stored back for the fix to take effect.
train_df['age'] = train_df.age.fillna(train_df.age.mean())

# Did your gender make a difference in survival?
train_df['gender'].value_counts()

# Sum and count of the 'survived' column per (class, gender) pair
survived_by_group = train_df.groupby(['pclass', 'gender'])['survived']
survived_by_group.agg(['sum', 'count'])

Image(filename='images/wesm_book_groupby.png')

# Group the passengers by class and gender
class_gender_group = train_df.groupby(['pclass', 'gender'])

# Iterating a GroupBy yields (key, sub-frame) pairs; print only the keys
for group_key, _members in class_gender_group:
    print(group_key)

# Plot age against survival, one panel per gender (rows) and class (columns)
grid_plot = sns.FacetGrid(train_df, row='gender', col='pclass')
grid_plot.map(sns.regplot, 'survived', 'age', color='.3', fit_reg=False, x_jitter=.1)

# Age distribution per class. The `groupby=` keyword has been removed from
# seaborn.boxplot; name the columns and pass the frame through `data=`.
ax = sns.boxplot(x='pclass', y='age', data=train_df)
ax.set_title('Age Distribution by class')

# Who paid the highest fare on the Titanic? Did they survive?
# sort_index(by=...) has been removed from pandas; sort_values replaces it.
train_df.sort_values(by='fare', ascending=False)

# Highest Paid ticket by Class.
def topn(group, field, n=5):
    """Return the ``n`` rows of ``group`` with the largest ``field`` values.

    Args:
        group: DataFrame to rank (for example one group from ``groupby``).
        field: Label of the column to sort by.
        n: Number of rows to keep.

    Returns:
        DataFrame of at most ``n`` rows, sorted by ``field`` in descending
        order.
    """
    # sort_index(by=...) has been removed from pandas; sort_values replaces it.
    return group.sort_values(by=field, ascending=False)[:n]

# Apply topn within each passenger class, keeping 2 rows ranked by fare
passengers_by_class = train_df.groupby('pclass')
passengers_by_class.apply(topn, field='fare', n=2)

# Youngest 2 by Class.
def botm(group, field, n=5):
    """Return the ``n`` rows of ``group`` with the smallest ``field`` values.

    Args:
        group: DataFrame to rank (for example one group from ``groupby``).
        field: Label of the column to sort by.
        n: Number of rows to keep.

    Returns:
        DataFrame of at most ``n`` rows, sorted by ``field`` in ascending
        order.
    """
    # sort_index(by=...) has been removed from pandas; sort_values replaces it.
    return group.sort_values(by=field)[:n]

# Apply botm within each passenger class, keeping 2 rows ranked by age
passengers_by_class = train_df.groupby('pclass')
passengers_by_class.apply(botm, field='age', n=2)

# Write back the changes
# train_df.to_csv('kaggle_titanic_data/train_modified.csv')

# Custom Translation of Values and creating new Columns
def gender_map(val):
    """Translate a gender label to a number: 1 for 'male', 0 for anything else."""
    return 1 if val == 'male' else 0
# New column holding the numeric translation of each gender label
train_df['gender_val'] = train_df['gender'].map(gender_map)

train_df

# Remove the helper column again
del train_df['gender_val']

# Iterating a DataFrame directly yields its column labels, not its rows
for column_label in train_df:
    print(column_label)

# Some mistakes I made when working with Pandas
# Don't use a loop to update a DataFrame. This row-by-row version is the
# anti-pattern being shown; it is written with `.at` because `.ix` has been
# removed from pandas. The vectorised equivalent is
# (train_df.gender == 'male').astype(int).
for k, row in train_df.iterrows():
    train_df.at[k, 'gender'] = 1 if row.gender == 'male' else 0

train_df

# Filtering gotchas: build each comparison as its own boolean mask and combine
# the masks with `&`
survived_mask = train_df.survived == 1
first_class_mask = train_df.pclass == 1
train_df[survived_mask & first_class_mask]