import pandas as pd import numpy as np import pylab as pl df = pd.read_csv("./data/credit-data-trainingset.csv") df.head() df.monthly_income.apply(np.log) add_10 = lambda x: x + 10 plus = lambda x, y: x + y print add_10(9) print plus(10, 20) df.monthly_income.apply(lambda x: np.log(x + 1)) #numpy actually has log(x + 1) help(np.log1p) def inverse(x): return 1 / (x + 1) df.monthly_income.apply(inverse) def cap_value(x, cap): """ x - a value cap - threshold value for x; if x > cap, then x is set to cap Examples: cap_value(1000, 10) 10 cap_value(10, 100) 10 """ # your code here return None print cap_value(1000, 10)==10 print cap_value(10, 100)==10 print df.debt_ratio.apply(lambda x: cap_value(x, 5.0)).mean()#should be close to 1.28 subset = df[['serious_dlqin2yrs', 'age', 'monthly_income']] subset.groupby("serious_dlqin2yrs") subset.groupby("serious_dlqin2yrs").mean() for name, group in subset.groupby("serious_dlqin2yrs"): print "splitting by: ", name print group.mean() print "*"*80 subset.groupby("serious_dlqin2yrs").agg([np.min, np.mean, np.median, np.max]) def age_x_income(frame): x = (frame.age * frame.monthly_income) return np.mean(x) subset.groupby("serious_dlqin2yrs").apply(age_x_income) pop = pd.read_csv("./data/uspop.csv") pop cols = ['age', 'monthly_income', 'serious_dlqin2yrs'] result = pd.merge(df[cols] , pop, how='left', on='age') result len(result) > len(df) pd.value_counts(pop.age).head() pop = pop[pop.age.duplicated()==False] cols = ['age', 'monthly_income', 'serious_dlqin2yrs'] joined = pd.merge(df[cols] , pop, how='left', on='age') pop.tail() joined.est_pop = joined.est_pop.fillna(4957.0) joined.est_pop.describe() from pandasql import sqldf pysqldf = lambda q: sqldf(q, globals()) query = """ select serious_dlqin2yrs , sum(1) as total from df group by serious_dlqin2yrs; """ pysqldf(query) query = """ select age , avg(serious_dlqin2yrs) as pct_delinquent from df group by age order by age; """ pysqldf(query)