"""Exploratory analysis of the Kaggle "Give Me Some Credit" training data.

Cleaned-up version of an IPython notebook transcript: Python 2 ``print``
statements, shell escapes / magics (``?``, ``!``, ``%load``) and deprecated
pandas APIs (``.ix``, ``describe(percentile_width=...)``, ``pd.value_counts``)
have been converted to runnable Python 3 / modern pandas equivalents.

Reads ``./data/credit-training.csv`` and writes the snake_cased,
null-filled result to ``./data/credit-data-post-import.csv``.
"""
import re

import numpy as np
import pandas as pd

try:
    # pylab (matplotlib) is only needed for the one bar plot below; all of
    # the tabular analysis works without it.
    import pylab as pl
except ImportError:  # pragma: no cover - plotting is optional
    pl = None


def camel_to_snake(column_name):
    """Convert a camelCase string into snake_case.

    Example:
        >>> camel_to_snake("javaLovesCamelCase")
        'java_loves_camel_case'

    See also:
        http://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-camel-case
    """
    # First pass splits runs like "XMLHttp" -> "XML_Http"; second pass
    # splits the remaining lower/digit-to-upper boundaries, then lowercase.
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', column_name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()


def print_null_freq(df):
    """Return a variable-by-isnull frequency table for *df*.

    Melts the whole frame into (variable, value) pairs and cross-tabulates
    each variable against whether its values are null, so you can see at a
    glance how many missing values each column has.
    """
    df_lng = pd.melt(df)
    null_variables = df_lng.value.isnull()
    return pd.crosstab(df_lng.variable, null_variables)


def main():
    # Was: ! head -n 2 ./data/credit-training.csv  (shell peek at the file)
    df = pd.read_csv("./data/credit-training.csv")

    # --- First look at the data ------------------------------------------
    print(df.head())
    print(type(df))
    print(df.SeriousDlqin2yrs.head())
    print(df.dtypes)
    print(df['DebtRatio'].head())          # bracket and attribute access
    print(df.DebtRatio.head())             # are equivalent for valid names
    print(df.head(1))
    print(df.tail())
    print(df.RevolvingUtilizationOfUnsecuredLines.tail())
    print(df.describe())
    # describe(percentile_width=25) was removed from pandas; the central-25%
    # band is the explicit percentiles below.
    print(df.age.describe(percentiles=[0.375, 0.625]))

    # --- Frequency of NumberOfDependents ---------------------------------
    print(df.NumberOfDependents.unique())
    print(df.NumberOfDependents.nunique())
    # pd.value_counts(series) is deprecated; call the Series method instead.
    print(df.NumberOfDependents.value_counts())
    print(df.NumberOfDependents.value_counts(ascending=True))
    print(df.NumberOfDependents.value_counts(sort=False))
    # chain value_counts together with head() to give you the top 3
    print(df.NumberOfDependents.value_counts().head(3))
    if pl is not None:
        df.NumberOfDependents.value_counts().plot(kind='bar')

    print(pd.crosstab(df.NumberOfTimes90DaysLate, df.SeriousDlqin2yrs))
    print(pd.crosstab(df.age, df.NumberOfDependents))

    # --- Normalize column names to snake_case ----------------------------
    df.columns = [camel_to_snake(col) for col in df.columns]
    print(df.columns.tolist())

    # --- Column selection -------------------------------------------------
    print(df['monthly_income'].head())
    print(df.monthly_income.head())
    print(df[['monthly_income', 'serious_dlqin2yrs']].head())
    columns_i_want = ['monthly_income', 'serious_dlqin2yrs']
    print(df[columns_i_want].head())

    # Attribute assignment does NOT create a column -- pandas warns and sets
    # a plain Python attribute -- so the bracket lookup below raises KeyError.
    df.newcolumn = 1
    try:
        df['newcolumn']
    except KeyError:
        print("'newcolumn' was never added as a real column")
    df['one'] = 1                          # bracket assignment does add one
    print(df.one.head())
    del df['one']

    # --- Boolean filtering ------------------------------------------------
    print((df.monthly_income > 5000).head())
    gt_5k = df[df.monthly_income > 5000]
    print(len(gt_5k), "people with monthly_income > 5000")
    print(df[df.monthly_income > 5000].head())
    # .ix was removed; df.ix[40:45, 0:2] on the default integer index was
    # label-based and inclusive, which is iloc rows 40..45 positionally.
    print(df.iloc[40:46, 0:2])

    mask = (df.monthly_income > 5000) & (df.serious_dlqin2yrs == 1)
    print(df[mask].head())
    mask = ((df.age >= 35)
            & (df.serious_dlqin2yrs == 0)
            & (df.number_of_open_credit_lines_and_loans < 10))
    print(len(df[mask]) == 76151)
    mask = ((df.monthly_income == df.monthly_income.quantile(0.90))
            & (df.serious_dlqin2yrs == 0))
    print(len(df[mask]))

    # --- Reshaping with melt; counting nulls per variable ----------------
    # By not specifying id_vars, we're going to melt EVERYTHING into a
    # series of (variable, value) rows -- think of building a pivot table
    # in Excel.
    df_lng = pd.melt(df)
    print(df_lng.head())
    null_variables = df_lng.value.isnull()
    print(null_variables.sum())
    # crosstab enumerates the possibilities between the two Series and
    # shows a count of occurrences in each bucket.
    print(pd.crosstab(df_lng.variable, null_variables))
    print(print_null_freq(df))

    # Exercise: melt just age and debt_ratio (150,000 rows x 2 = 300,000).
    melted = pd.melt(df, id_vars=['serious_dlqin2yrs'],
                     value_vars=['age', 'debt_ratio'])
    print(len(melted) == 300000)
    print(melted.variable.unique() == np.array(['age', 'debt_ratio']))

    # --- Filling missing values ------------------------------------------
    s = pd.Series([1, 2, None, 4])
    print(s)
    print(s.fillna(3))
    print(s.ffill())
    print(s.bfill())
    print(s.fillna(s.mean()))

    df['number_of_dependents'] = df.number_of_dependents.fillna(0)
    # proof that number_of_dependents no longer contains nulls
    print(print_null_freq(df))
    print(df.monthly_income.describe())

    df.to_csv("./data/credit-data-post-import.csv", index=False)


if __name__ == "__main__":
    main()