"""Exploratory analysis of the Kaggle "Give Me Some Credit" training data.

Cleaned-up version of an IPython notebook transcript: Python 2 ``print``
statements, shell escapes / magics (``?``, ``!``, ``%load``) and deprecated
pandas APIs (``.ix``, ``describe(percentile_width=...)``, ``pd.value_counts``)
have been converted to runnable Python 3 / modern pandas equivalents.

Reads ``./data/credit-training.csv`` and writes the snake_cased,
null-filled result to ``./data/credit-data-post-import.csv``.
"""
import re

import numpy as np
import pandas as pd

try:
    # pylab (matplotlib) is only needed for the one bar plot below; all of
    # the tabular analysis works without it.
    import pylab as pl
except ImportError:  # pragma: no cover - plotting is optional
    pl = None


def camel_to_snake(column_name):
    """Convert a camelCase string into snake_case.

    Example:
        >>> camel_to_snake("javaLovesCamelCase")
        'java_loves_camel_case'

    See also:
        http://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-camel-case
    """
    # First pass splits runs like "XMLHttp" -> "XML_Http"; second pass
    # splits the remaining lower/digit-to-upper boundaries, then lowercase.
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', column_name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()


def print_null_freq(df):
    """Return a variable-by-isnull frequency table for *df*.

    Melts the whole frame into (variable, value) pairs and cross-tabulates
    each variable against whether its values are null, so you can see at a
    glance how many missing values each column has.
    """
    df_lng = pd.melt(df)
    null_variables = df_lng.value.isnull()
    return pd.crosstab(df_lng.variable, null_variables)


def main():
    # Was: ! head -n 2 ./data/credit-training.csv  (shell peek at the file)
    df = pd.read_csv("./data/credit-training.csv")

    # --- First look at the data ------------------------------------------
    print(df.head())
    print(type(df))
    print(df.SeriousDlqin2yrs.head())
    print(df.dtypes)
    print(df['DebtRatio'].head())          # bracket and attribute access
    print(df.DebtRatio.head())             # are equivalent for valid names
    print(df.head(1))
    print(df.tail())
    print(df.RevolvingUtilizationOfUnsecuredLines.tail())
    print(df.describe())
    # describe(percentile_width=25) was removed from pandas; the central-25%
    # band is the explicit percentiles below.
    print(df.age.describe(percentiles=[0.375, 0.625]))

    # --- Frequency of NumberOfDependents ---------------------------------
    print(df.NumberOfDependents.unique())
    print(df.NumberOfDependents.nunique())
    # pd.value_counts(series) is deprecated; call the Series method instead.
    print(df.NumberOfDependents.value_counts())
    print(df.NumberOfDependents.value_counts(ascending=True))
    print(df.NumberOfDependents.value_counts(sort=False))
    # chain value_counts together with head() to give you the top 3
    print(df.NumberOfDependents.value_counts().head(3))
    if pl is not None:
        df.NumberOfDependents.value_counts().plot(kind='bar')

    print(pd.crosstab(df.NumberOfTimes90DaysLate, df.SeriousDlqin2yrs))
    print(pd.crosstab(df.age, df.NumberOfDependents))

    # --- Normalize column names to snake_case ----------------------------
    df.columns = [camel_to_snake(col) for col in df.columns]
    print(df.columns.tolist())

    # --- Column selection -------------------------------------------------
    print(df['monthly_income'].head())
    print(df.monthly_income.head())
    print(df[['monthly_income', 'serious_dlqin2yrs']].head())
    columns_i_want = ['monthly_income', 'serious_dlqin2yrs']
    print(df[columns_i_want].head())

    # Attribute assignment does NOT create a column -- pandas warns and sets
    # a plain Python attribute -- so the bracket lookup below raises KeyError.
    df.newcolumn = 1
    try:
        df['newcolumn']
    except KeyError:
        print("'newcolumn' was never added as a real column")
    df['one'] = 1                          # bracket assignment does add one
    print(df.one.head())
    del df['one']

    # --- Boolean filtering ------------------------------------------------
    print((df.monthly_income > 5000).head())
    gt_5k = df[df.monthly_income > 5000]
    print(len(gt_5k), "people with monthly_income > 5000")
    print(df[df.monthly_income > 5000].head())
    # .ix was removed; df.ix[40:45, 0:2] on the default integer index was
    # label-based and inclusive, which is iloc rows 40..45 positionally.
    print(df.iloc[40:46, 0:2])

    mask = (df.monthly_income > 5000) & (df.serious_dlqin2yrs == 1)
    print(df[mask].head())
    mask = ((df.age >= 35)
            & (df.serious_dlqin2yrs == 0)
            & (df.number_of_open_credit_lines_and_loans < 10))
    print(len(df[mask]) == 76151)
    mask = ((df.monthly_income == df.monthly_income.quantile(0.90))
            & (df.serious_dlqin2yrs == 0))
    print(len(df[mask]))

    # --- Reshaping with melt; counting nulls per variable ----------------
    # By not specifying id_vars, we're going to melt EVERYTHING into a
    # series of (variable, value) rows -- think of building a pivot table
    # in Excel.
    df_lng = pd.melt(df)
    print(df_lng.head())
    null_variables = df_lng.value.isnull()
    print(null_variables.sum())
    # crosstab enumerates the possibilities between the two Series and
    # shows a count of occurrences in each bucket.
    print(pd.crosstab(df_lng.variable, null_variables))
    print(print_null_freq(df))

    # Exercise: melt just age and debt_ratio (150,000 rows x 2 = 300,000).
    melted = pd.melt(df, id_vars=['serious_dlqin2yrs'],
                     value_vars=['age', 'debt_ratio'])
    print(len(melted) == 300000)
    print(melted.variable.unique() == np.array(['age', 'debt_ratio']))

    # --- Filling missing values ------------------------------------------
    s = pd.Series([1, 2, None, 4])
    print(s)
    print(s.fillna(3))
    print(s.ffill())
    print(s.bfill())
    print(s.fillna(s.mean()))

    df['number_of_dependents'] = df.number_of_dependents.fillna(0)
    # proof that number_of_dependents no longer contains nulls
    print(print_null_freq(df))
    print(df.monthly_income.describe())

    df.to_csv("./data/credit-data-post-import.csv", index=False)


if __name__ == "__main__":
    main()