"""Clean the loans dataset and cross-check it against a reference CSV.

The helpers below are pure and importable.  The network-bound steps (reading
the CSV files and comparing the frames) run only when the file is executed
as a notebook or script, where ``__name__ == "__main__"``; in that case they
create the same global names as before (``df``, ``clean_df``, ``final_df``,
``checking_df``, ``ne_stacked``, ``idx``).
"""
import pandas as pd

LOANS_URL = 'https://cdn.rawgit.com/benedict-chan/pytest/master/datasets/loansData.csv'
CHECK_URL = 'https://cdn.rawgit.com/benedict-chan/pytest/master/datasets/loanf.csv'


def remove_percent_converter(x):
    """Return the string *x* as a float with every '%' removed ('8.90%' -> 8.9)."""
    return float(x.replace('%', ''))


def remove_month_converter(x):
    """Return the string *x* as a float with 'months' removed ('36 months' -> 36.0)."""
    return float(x.replace('months', ''))


def fico_function(x):
    """Return the part of *x* before the first '-' as a float ('735-739' -> 735.0)."""
    return float(x.split('-')[0])


def count_without(series, token):
    """Count the entries of the string *series* that do not contain *token*.

    Missing values are not counted: ``na=True`` marks them as "contains", which
    matches the earlier ``str.contains(token) == False`` filter (NaN == False
    is False, so such rows were never selected).
    """
    has_token = series.str.contains(token, na=True).astype(bool)
    return int((~has_token).sum())


def with_sequential_index(frame, name='NewIndex'):
    """Return a copy of *frame* indexed 1..len(frame) under an index called *name*.

    The numbering is derived from the number of rows, not from the non-null
    count of a data column, so missing values cannot cause a length mismatch.
    """
    numbered = frame.assign(**{name: range(1, len(frame) + 1)})
    return numbered.set_index(name)


def unmatched_row_labels(frame):
    """Return the index labels of rows whose column values occur exactly once.

    ``duplicated(keep=False)`` flags every member of a repeated row, so the
    complement is the set of rows without a twin.  Labels come back in the
    order the rows appear in *frame*.  Unlike a ``groupby`` over all columns,
    rows holding NaN are not silently dropped from the comparison.
    """
    return frame.index[~frame.duplicated(keep=False)].tolist()


def _show(value):
    """Display *value* through IPython when it is installed, otherwise print it."""
    try:
        from IPython.display import display
    except ImportError:
        print(value)
    else:
        display(value)


if __name__ == "__main__":
    # Plotting setup is only needed for the interactive run.
    import matplotlib.pyplot as plt

    try:
        # Equivalent of the ``%matplotlib inline`` notebook magic.
        get_ipython().run_line_magic('matplotlib', 'inline')
    except NameError:
        pass  # plain Python interpreter: no IPython shell, nothing to configure

    # --- Inspect the raw data -------------------------------------------
    df = pd.read_csv(LOANS_URL)
    _show(df.head())
    _show(df.dtypes)

    # Sanity checks before stripping units: how many values lack the suffix?
    _show((count_without(df['Interest.Rate'], '%'),
           count_without(df['Debt.To.Income.Ratio'], '%')))
    _show(count_without(df['Loan.Length'], 'months'))

    # --- Re-read with converters that strip the units ---------------------
    clean_df = pd.read_csv(
        LOANS_URL,
        converters={
            'Interest.Rate': remove_percent_converter,
            'Debt.To.Income.Ratio': remove_percent_converter,
            'Loan.Length': remove_month_converter,
        },
    )
    clean_df['FICO.Score'] = clean_df['FICO.Range'].map(fico_function)
    clean_df = clean_df.rename(columns={'Amount.Requested': 'Loan.Amount'})
    clean_df = with_sequential_index(clean_df)

    final_columns = ["Interest.Rate", "FICO.Score", "Loan.Length",
                     "Monthly.Income", "Loan.Amount"]
    final_df = clean_df[final_columns]
    final_df = final_df[final_df['Monthly.Income'] < 100000]

    # --- Compare against the reference file -------------------------------
    # NOTE(review): loanf.csv is read without index_col; if that file carries
    # a saved index column the two frames will not share columns -- confirm.
    checking_df = pd.read_csv(CHECK_URL)
    checking_df = checking_df.sort_index()
    checking_df.index.name = 'NewIndex'

    ne_stacked = pd.concat([final_df, checking_df])
    ne_stacked = ne_stacked.reset_index(drop=True)

    # Rows present in only one of the two frames (empty when they agree).
    idx = unmatched_row_labels(ne_stacked)
    _show(ne_stacked.reindex(idx))