import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Load the raw loans data set for a first look at its contents.
df = pd.read_csv(
    'https://cdn.rawgit.com/benedict-chan/pytest/master/datasets/loansData.csv'
)

df.head()

df.dtypes

# Count the rows whose percentage columns do NOT contain a '%' sign.
# Missing values are treated as "contains" (na=True) so they are not counted,
# which matches comparing the raw result against False.
(
    len(df.loc[~df['Interest.Rate'].str.contains('%', na=True)]),
    len(df.loc[~df['Debt.To.Income.Ratio'].str.contains('%', na=True)]),
)

def remove_percent_converter(x: str) -> float:
    """Convert a percentage string such as ``'8.90%'`` to a float.

    Used as a ``converters=`` callable when reading the loans CSV.

    Args:
        x: Raw cell text, optionally containing ``'%'`` characters.

    Returns:
        The numeric value with every ``'%'`` removed, or NaN when nothing
        is left after removing ``'%'`` and surrounding whitespace.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    text = x.replace('%', '').strip()
    if not text:
        # A blank cell would make float('') raise and abort the whole read;
        # report it as a missing value instead.
        return float('nan')
    return float(text)

# Count the rows whose Loan.Length does NOT mention 'months'.
# Missing values are treated as "contains" (na=True) so they are not counted,
# which matches comparing the raw result against False.
len(df.loc[~df['Loan.Length'].str.contains('months', na=True)])

def remove_month_converter(x: str) -> float:
    """Convert a loan-length string such as ``'36 months'`` to a float.

    Used as a ``converters=`` callable when reading the loans CSV.

    Args:
        x: Raw cell text, optionally containing the word ``'months'``.

    Returns:
        The numeric value with every ``'months'`` removed, or NaN when
        nothing is left after removing it and surrounding whitespace.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    text = x.replace('months', '').strip()
    if not text:
        # A blank cell would make float('') raise and abort the whole read;
        # report it as a missing value instead.
        return float('nan')
    return float(text)

# Read the same CSV again, converting the percentage and loan-length
# columns to floats while parsing.
clean_df = pd.read_csv(
    'https://cdn.rawgit.com/benedict-chan/pytest/master/datasets/loansData.csv',
    converters={
        'Interest.Rate': remove_percent_converter,
        'Debt.To.Income.Ratio': remove_percent_converter,
        'Loan.Length': remove_month_converter,
    },
)

def fico_function(x):
    """Return the part of ``x`` before the first ``'-'`` as a float.

    For a range string such as ``'735-739'`` this is the lower bound.
    """
    lower_bound, _, _ = x.partition('-')
    return float(lower_bound)
# Derive a numeric score from each FICO range string.
clean_df['FICO.Score'] = clean_df['FICO.Range'].map(fico_function)

clean_df = clean_df.rename(columns={'Amount.Requested': 'Loan.Amount'})

# Re-label the rows 1..N. The label count comes from len(clean_df) rather
# than a column's .count(): count() skips NaN, so a single missing value in
# that column would produce too few labels and raise a length-mismatch error.
clean_df['NewIndex'] = range(1, len(clean_df) + 1)
clean_df = clean_df.set_index('NewIndex')

# Keep only the columns used in the comparison below.
final_columns = [
    "Interest.Rate",
    "FICO.Score",
    "Loan.Length",
    "Monthly.Income",
    "Loan.Amount",
]
final_df = clean_df[final_columns]

# Keep rows with a monthly income below 100000; rows where the comparison
# is False (including missing incomes) are dropped.
final_df = final_df[final_df['Monthly.Income'] < 100000]

# Load the reference data set, ordered by its index, and give the index
# the same name as the cleaned frame's index.
checking_df = pd.read_csv(
    'https://cdn.rawgit.com/benedict-chan/pytest/master/datasets/loanf.csv'
).sort_index()
checking_df.index.name = 'NewIndex'

# Stack the cleaned frame on top of the reference frame and renumber rows
# from 0 so every row has a unique label.
ne_stacked = pd.concat([final_df, checking_df]).reset_index(drop=True)

# Group on every column: identical rows land in the same group.
# NOTE(review): groupby skips rows whose key contains NaN by default, so such
# rows never appear in idx -- confirm that is acceptable for this check.
df_gpby = ne_stacked.groupby(ne_stacked.columns.tolist())

# Labels of rows that occur exactly once, i.e. have no identical partner.
idx = [
    row_labels[0]
    for row_labels in df_gpby.groups.values()
    if len(row_labels) == 1
]

ne_stacked.reindex(idx)