# Next line commented out because we only want to run this once #!python src/load_kiva_lenders_to_mongodb.py # Next line commented out because we only want to run this once #!python src/load_kiva_loans_lenders_to_mongodb.py !python src/convert_mongodb_loans_to_dataframe.py --dataDir data/predicting_funding \ --baseName loans \ --startYear 2006 \ --endYear 2014 %matplotlib inline import pandas as pd import numpy as np import sys import matplotlib.pyplot as plt from matplotlib.dates import date2num from pandas.io.pytables import read_hdf import matplotlib.dates as mdates loansDataFrameFile = 'data/predicting_funding/loans_dataframe.h5' print >> sys.stderr, "Reading loans dataframe file %s ..." % loansDataFrameFile, loansDF = read_hdf(loansDataFrameFile, 'table') print >> sys.stderr, "done" #print "loansDF.shape = ", loansDF.shape # Map date to the first day of the quarter loansDF['posted_quarter'] = loansDF['posted_date'].apply( lambda x: x.replace(hour=0, minute=0, second=0, day=1, month=((x.month / 4)*3)+1 )) loansDF['fully_funded'] = loansDF['funded_amount'] >= loansDF['loan_amount'] #loansDF.head(3) loansDFGroupedByMonthAndGender = loansDF.groupby(['posted_quarter','borrower_majority_gender','fully_funded']) XSeries = [] YSeries = {'M': {True: {}, False: {}}, 'F': {True: {}, False: {}}, 'N': {True: {}, False: {}}} for k,v in sorted(loansDFGroupedByMonthAndGender.groups.items(), key=lambda x : x[0][0]): period, gender, isFullyFunded = k count = len(v) XSeries.append(period) YSeries[gender][isFullyFunded][period] = count for p in XSeries: for gender in ['M', 'F', 'N']: for isFullyFunded in [True, False]: if not YSeries[gender][isFullyFunded].has_key(p): YSeries[gender][isFullyFunded][p] = 0 YSeriesMaleNotFullyFunded = [YSeries['M'][False][p] for p in XSeries] YSeriesMaleFullyFunded = [YSeries['M'][True][p] for p in XSeries] YSeriesFemaleNotFullyFunded = [YSeries['F'][False][p] for p in XSeries] YSeriesFemaleFullyFunded = [YSeries['F'][True][p] for p in XSeries] # Negligeable #YSeriesNeutralNotFullyFunded = [YSeries['N'][False][p] for p in XSeries] #YSeriesNeutralFullyFunded = [YSeries['N'][True][p] for p in XSeries] width = 20 fig = plt.figure(40, figsize=(12,12)) ax = fig.add_subplot(111) ax.bar(date2num(XSeries), YSeriesMaleNotFullyFunded,label="male, not fully funded",width=width,color='lightblue',align='center') ax.bar(date2num(XSeries), YSeriesMaleFullyFunded,bottom=YSeriesMaleNotFullyFunded,label="male, fully funded",width=width,color='blue',align='center') ax.bar(date2num(XSeries)+30, YSeriesFemaleNotFullyFunded,label="female, not fully funded", width=width,color='lightgreen',align='center') ax.bar(date2num(XSeries)+30, YSeriesFemaleFullyFunded, bottom=YSeriesFemaleNotFullyFunded ,label="female, fully funded", width=width,color='green',align='center') ax.xaxis_date() plt.setp(plt.gca().get_yticklabels(), fontsize="25") plt.setp(plt.gca().get_xticklabels(), fontsize="25", ha="left") ax.autoscale(tight=True) plt.legend(loc="best", fontsize="25") plt.title('Number of loans per quarter\nsplit by gender and funding status', fontsize="25", y=1.03) plt.ylabel('Number of loans', fontsize="25") ax.yaxis.labelpad = 20 plt.show() #print loansDF.head() amountXSeries = [] amountYSeries = {'M': {True: {}, False: {}}, 'F': {True: {}, False: {}}, 'N': {True: {}, False: {}}} for k,v in sorted(loansDFGroupedByMonthAndGender.groups.items(), key=lambda x : x[0][0]): period, gender, isFullyFunded = k row = loansDF.loc[v] loanAmountSum = sum(row['loan_amount']) # print "loanAmountSum = ", loanAmountSum amountXSeries.append(period) if amountYSeries[gender][isFullyFunded].has_key(period): # print "increasing %s,%s,%s from %d to" % (gender, isFullyFunded, period, amountYSeries[gender][isFullyFunded][period]), amountYSeries[gender][isFullyFunded][period] += loanAmountSum # print "%d" % amountYSeries[gender][isFullyFunded][period] else: amountYSeries[gender][isFullyFunded][period] = loanAmountSum # print "setting %s,%s,%s to %d" % (gender, isFullyFunded, period, amountYSeries[gender][isFullyFunded][period]) for p in amountXSeries: for gender in ['M', 'F', 'N']: for isFullyFunded in [True, False]: if not amountYSeries[gender][isFullyFunded].has_key(p): amountYSeries[gender][isFullyFunded][p] = 0 #print amountYSeries amountYSeriesMaleNotFullyFunded = [amountYSeries['M'][False][p] for p in amountXSeries] amountYSeriesMaleFullyFunded = [amountYSeries['M'][True][p] for p in amountXSeries] amountYSeriesFemaleNotFullyFunded = [amountYSeries['F'][False][p] for p in amountXSeries] amountYSeriesFemaleFullyFunded = [amountYSeries['F'][True][p] for p in amountXSeries] # Negligeable #amountYSeriesNeutralNotFullyFunded = [amountYSeries['N'][False][p] for p in amountXSeries] #amountYSeriesNeutralFullyFunded = [amountYSeries['N'][True][p] for p in amountXSeries] width = 20 fig = plt.figure(40, figsize=(12,12)) ax1 = fig.add_subplot(111) ax1.bar(date2num(amountXSeries), amountYSeriesMaleNotFullyFunded,label="male, not fully funded",width=width,color='lightblue',align='center') ax1.bar(date2num(amountXSeries), amountYSeriesMaleFullyFunded,bottom=amountYSeriesMaleNotFullyFunded,label="male, fully funded",width=width,color='blue',align='center') ax1.bar(date2num(amountXSeries)+30, amountYSeriesFemaleNotFullyFunded,label="female, not fully funded", width=width,color='lightgreen',align='center') ax1.bar(date2num(amountXSeries)+30, amountYSeriesFemaleFullyFunded, bottom=amountYSeriesFemaleNotFullyFunded ,label="female, fully funded", width=width,color='green',align='center') ax1.xaxis_date() plt.setp(plt.gca().get_yticklabels(), fontsize="20") plt.setp(plt.gca().get_xticklabels(), fontsize="20", ha="left") ax.autoscale(tight=True) plt.legend(loc="best", fontsize="20") plt.title('Total requested loan amounts per quarter\nsplit by gender and funding status', fontsize="20", y=1.03) plt.ylabel('USD', fontsize="20") ax.yaxis.labelpad = 20 plt.show() loanXSeries = {'M': {True: [], False: []}, 'F': {True: [], False: []}, 'N': {True: [], False: []}} loanYSeries = {'M': {True: {}, False: {}}, 'F': {True: {}, False: {}}, 'N': {True: {}, False: {}}} for k,v in sorted(loansDFGroupedByMonthAndGender.groups.items(), key=lambda x : x[0][0]): period, gender, isFullyFunded = k row = loansDF.loc[v] loanAmountAverage = sum(row['loan_amount']) / float(len(row['loan_amount'])) fundedAmountAverage = sum(row['funded_amount']) / float(len(row['funded_amount'])) lenderCountAverage = sum(row['lender_count']) / float(len(row['lender_count'])) loanXSeries[gender][isFullyFunded].append(period) loanYSeries[gender][isFullyFunded][period] = {'loanAmountAvg': loanAmountAverage, 'fundedAmountAvg': fundedAmountAverage, 'lenderCountAvg' : lenderCountAverage} loanYSeriesMaleNotFullyFunded = [loanYSeries['M'][False][p]['loanAmountAvg'] for p in loanXSeries['M'][False]] loanYSeriesMaleFullyFunded = [loanYSeries['M'][True][p]['loanAmountAvg'] for p in loanXSeries['M'][True]] fundedYSeriesMaleNotFullyFunded = [loanYSeries['M'][False][p]['fundedAmountAvg'] for p in loanXSeries['M'][False]] lenderCountYSeriesMaleNotFullyFunded = [loanYSeries['M'][False][p]['lenderCountAvg'] for p in loanXSeries['M'][False]] lenderCountYSeriesMaleFullyFunded = [loanYSeries['M'][True][p]['lenderCountAvg'] for p in loanXSeries['M'][True]] loanYSeriesFemaleNotFullyFunded = [loanYSeries['F'][False][p]['loanAmountAvg'] for p in loanXSeries['F'][False]] loanYSeriesFemaleFullyFunded = [loanYSeries['F'][True][p]['loanAmountAvg'] for p in loanXSeries['F'][True]] fundedYSeriesFemaleNotFullyFunded = [loanYSeries['F'][False][p]['fundedAmountAvg'] for p in loanXSeries['F'][False]] lenderCountYSeriesFemaleNotFullyFunded = [loanYSeries['F'][False][p]['lenderCountAvg'] for p in loanXSeries['F'][False]] lenderCountYSeriesFemaleFullyFunded = [loanYSeries['F'][True][p]['lenderCountAvg'] for p in loanXSeries['F'][True]] width = 20 fig = plt.figure(60, figsize=(12,8)) ax2 = fig.add_subplot(2,1,1) ax2.plot(date2num(loanXSeries['M'][False]), loanYSeriesMaleNotFullyFunded, '+-', label="avg. loan amount, male, not fully funded", color='blue') ax2.plot(date2num(loanXSeries['M'][False]), fundedYSeriesMaleNotFullyFunded, '*-', label="avg. funded amount, male, not fully funded", color='blue') ax2.plot(date2num(loanXSeries['M'][True]), loanYSeriesMaleFullyFunded, 'o-', label="avg. loan=funded amount, male, fully funded", color='blue') ax2.plot(date2num(loanXSeries['F'][False]), loanYSeriesFemaleNotFullyFunded, '+-', label="avg. loan amount, female, not fully funded", color='green') ax2.plot(date2num(loanXSeries['F'][False]), fundedYSeriesFemaleNotFullyFunded, '*-', label="avg. funded amount, female, not fully funded", color='green') ax2.plot(date2num(loanXSeries['F'][True]), loanYSeriesFemaleFullyFunded, 'o-', label="avg. loan=funded amount, female, fully funded", color='green') ax2.xaxis_date() ax2.autoscale(tight=True) ax2.set_ylabel('USD') ax2.legend(loc="best") ax2.set_title('Loan characteristics per quarter, split by gender and funding status') ax3 = fig.add_subplot(2,1,2) ax3.plot(date2num(loanXSeries['M'][True]), lenderCountYSeriesMaleFullyFunded, 'o-', label="male, fully funded", color='blue') ax3.plot(date2num(loanXSeries['F'][True]), lenderCountYSeriesFemaleFullyFunded, 'o-', label="female, fully funded", color='green') ax3.plot(date2num(loanXSeries['M'][False]), lenderCountYSeriesMaleNotFullyFunded, 'x-', label="male, not fully funded", color='blue') ax3.plot(date2num(loanXSeries['F'][False]), lenderCountYSeriesFemaleNotFullyFunded, 'x-', label="female, not fully funded", color='green') ax3.set_ylabel("Number of lenders per loan") ax3.xaxis_date() ax3.autoscale(tight=True) ax3.set_title('Average number of lenders per loan, split by gender and funding status') ax3.legend(loc="best") plt.show() !python src/calculate_country2country_loan_flow.py --inDataDir data/predicting_funding \ --isoCountryCodesFile data/predicting_funding/iso3166_country_codes.tsv \ --outDataDir data/predicting_funding \ --outBaseName kiva \ --startYear 2012 \ --endYear 2014 \ --minValue 5000 !cp data/predicting_funding/kiva_country2country_loan_flows.json d3/data.json %%HTML %matplotlib inline import pandas as pd import numpy as np import sys import matplotlib.pyplot as plt from matplotlib.dates import date2num from pandas.io.pytables import read_hdf loansDataFrameFile = 'data/predicting_funding/loans_dataframe.h5' print >> sys.stderr, "Reading loans dataframe file %s ..." % loansDataFrameFile, loansDF = read_hdf(loansDataFrameFile, 'table') print >> sys.stderr, "done" # We're only interested in loans from the period 2012-2014 that were not fully funded loansDF['funding_ratio'] = loansDF['funded_amount'] / loansDF['loan_amount'] nonFundedLoansDF = loansDF[loansDF['funding_ratio'] < 1.0] #nonFundedLoansDF.head(3) fundingRatioSeries = nonFundedLoansDF['funding_ratio'].values print "number of not-fully funded loans in 2012-2014 period:", len(fundingRatioSeries) plt.title("Distribution of funding ratio\nfor not-fully funded loans applications") for nrBins in [2,3,4,5,10,20,100]: plt.hist(fundingRatioSeries, nrBins, label="%d bins" % nrBins) plt.legend() plt.show() nonFundedLoansFemaleDF = loansDF[(loansDF['funding_ratio'] < 1.0) & (loansDF['borrower_majority_gender'] == 'F')] nonFundedLoansMaleDF = loansDF[(loansDF['funding_ratio'] < 1.0) & (loansDF['borrower_majority_gender'] == 'M')] FemaleFundingRatioSeries = nonFundedLoansFemaleDF['funding_ratio'].values MaleFundingRatioSeries = nonFundedLoansMaleDF['funding_ratio'].values print "number of not-fully funded female loans in 2012-2014 period:", len(FemaleFundingRatioSeries) print "number of not-fully funded male loans in 2012-2014 period:", len(MaleFundingRatioSeries) plt.title("Distribution of funding ratio\nfor not-fully funded loans applications by FEMALES") for nrBins in [2,3,4,5,10,20,100]: plt.hist(FemaleFundingRatioSeries, nrBins, label="%d bins" % nrBins) plt.legend() plt.show() plt.title("Distribution of funding ratio\nfor not-fully funded loans applications by MALES") for nrBins in [2,3,4,5,10,20,100]: plt.hist(MaleFundingRatioSeries, nrBins, label="%d bins" % nrBins) plt.legend() plt.show() !python src/convert_mongodb_to_blei_ldac.py --dataDir data/predicting_funding\ --corpusBaseName kiva \ --stopwordFile=data/predicting_funding/kiva_stopwords.tsv \ --startYear 2012 \ --endYear 2014 \ --maxNrDocs 1000000 \ --filterBelow 10 \ --filterAbove 0.5 \ --filterKeepN 1000 \ --classificationField funding_ratio \ --classLabelFileName kiva-labels-with-loan-ids.dat !python src/prepare_sdla_train_test_sets.py --bleiCorpusFile data/predicting_funding/kiva.lda-c \ --bleiLabelFile data/predicting_funding/kiva-labels-with-loan-ids.dat \ --bleiTrainCorpusFile data/predicting_funding/slda_in/kiva-train-data.dat \ --bleiTrainLabelFile data/predicting_funding/slda_in/kiva-train-label.dat \ --bleiTrainIdFile data/predicting_funding/slda_in/kiva-train-loan-ids.dat \ --bleiTestCorpusFile data/predicting_funding/slda_in/kiva-test-data.dat \ --bleiTestLabelFile data/predicting_funding/slda_in/kiva-test-label.dat \ --bleiTestIdFile data/predicting_funding/slda_in/kiva-test-loan-ids.dat \ --test_size 4000 \ --train_size 36000 !/Users/frederik/install/slda-master/slda est \ data/predicting_funding/slda_in/kiva-train-data.dat \ data/predicting_funding/slda_in/kiva-train-label.dat \ data/predicting_funding/slda_settings.txt 1.0 20 \ seeded \ data/predicting_funding/slda_out !/Users/frederik/install/slda-master/slda inf \ data/predicting_funding/slda_in/kiva-test-data.dat \ data/predicting_funding/slda_in/kiva-test-label.dat \ data/predicting_funding/slda_settings.txt \ data/predicting_funding/slda_out/final.model \ data/predicting_funding/slda_out !python src/evaluate_sdla.py --predictedFile data/predicting_funding/slda_out/inf-labels.dat \ --expectedFile data/predicting_funding/slda_in/kiva-test-label.dat \ --average weighted !python src/print_topics.py --vocabFile data/predicting_funding/kiva.lda-c.vocab \ --sldaModelFile data/predicting_funding/slda_out/final.model.text !python src/loan_funding_predictor_poc.py --trainSldaGammaFile data/predicting_funding/slda_out/final.gamma \ --trainLabelFile data/predicting_funding/slda_in/kiva-train-label.dat \ --trainIdFile data/predicting_funding/slda_in/kiva-train-loan-ids.dat \ --testSldaGammaFile data/predicting_funding/slda_out/inf-gamma.dat \ --testLabelFile data/predicting_funding/slda_out/inf-labels.dat \ --testIdFile data/predicting_funding/slda_in/kiva-test-loan-ids.dat \ --feat "slda" \ --feat "borrower_majority_gender" \ --feat "loan_amount" \ --feat "has_image" \ --feat "posted_day_of_month" \ --feat "posted_month" \ --feat "geo_lat" \ --feat "geo_lon" \ --feat "repayment_term" \ --feat "nr_borrowers" \ --feat "bonus_credit_eligibility" \ --feat "translated" \ --feat "partner_rating" \ --feat "partner_delinquency_rate" \ --feat "partner_loans_posted" \ --feat "partner_total_amount_raised" \ --feat "en_description_length" \ --feat "constant" \ !python src/loan_funding_predictor.py --startYear 2012 \ --endYear 2014 \ --logResModelFile data/predicting_funding/logres_out/kivaLoanFundingPredictor.pkl %matplotlib inline import matplotlib.pyplot as plt import numpy as np fig = plt.figure(20, figsize=(8,8)) months = ['Mar', 'Feb', 'Jan', 'Apr', 'Oct', 'Nov', 'Aug', 'Jul', 'May', 'Dec', 'Sep', 'Jun'] logresCoefficients = [0.49, 0.42, 0.35, 0.12, -0.04, -0.11, -0.12, -0.12, -0.15, -0.17, -0.20, -0.28] index = np.arange(12) bar_width = 0.50 plt.bar(index, logresCoefficients, bar_width) plt.xlabel('Month', fontsize="20") plt.ylabel('Impact on positive outcome (full funding) ', fontsize="20") plt.title('LogRes Model Coefficients for PostedMonth features', fontsize="20", y=1.03) plt.xticks(index + bar_width, months) plt.setp(plt.gca().get_yticklabels(), fontsize="20") plt.setp(plt.gca().get_xticklabels(), fontsize="20", ha="center", rotation=45) #plt.autoscale(tight=True) plt.tight_layout() plt.show() %matplotlib inline import matplotlib.pyplot as plt import numpy as np fig = plt.figure(20, figsize=(20,8)) ax = fig.add_subplot(111) topics = [['one', 'group', 'fund', 'farmers', 'season', 'also', 'harvest', 'farmer', 'year', 'savings'], ['small', 'help', 'used', 'town', 'expand', 'requested', 'large', 'hopes', 'brac', 'lives'], ['selling', 'increase', 'shop', 'capital', 'clothing', 'started', 'sells', 'clothes', 'ago', 'working'], ['milk', 'cattle', 'also', 'livestock', 'farm', 'raising', 'two', 'purchase', 'pigs', 'cows'], ['use', 'five', 'married', 'hopes', 'goal', 'future', 'profits', 'plans', 'previous', 'kes'], ['needs', 'sell', 'market', 'support', 'financial', 'provide', 'opportunity', 'price', 'higher', 'however'], ['husband', 'woman', 'like', 'married', 'man', 'would', 'money', 'family', 'hardworking', 'purchase'], ['women', 'members', 'sells', 'profit', 'drinks', 'grocery', 'various', 'made', 'cosmetics', 'live'], ['able', 'work', 'support', 'thanks', 'help', 'continue', 'service', 'lenders', 'little', 'due'], ['school', 'mother', 'education', 'pay', 'father', 'young', 'year', 'single', 'parents', 'daughter'], ['improve', 'living', 'since', 'requesting', 'old', 'partner', 'services', 'married', 'experience', 'car'], ['income', 'earn', 'farming', 'rice', 'living', 'hopes', 'main', 'per', 'day', 'two'], ['house', 'home', 'lives', 'old', 'son', 'wife', 'vegetables', 'materials', 'live', 'requesting'], ['city', 'time', 'works', 'day', 'making', 'located', 'well', 'says', 'bank', 'work'], ['family', 'income', 'new', 'conditions', 'local', 'start', 'cover', 'purchase', 'credit', 'lives'], ['store', 'php', 'additional', 'future', 'nwtf', 'requested', 'general', 'save', 'earns', 'philippines'], ['products', 'customers', 'store', 'good', 'many', 'quality', 'sales', 'life', 'offer', 'household'], ['better', 'work', 'life', 'corn', 'supplies', 'give', 'crops', 'land', 'worked', 'basic'], ['school', 'food', 'water', 'sugar', 'selling', 'married', 'oil', 'six', 'four', 'rice'], ['community', 'meet', 'every', 'help', 'household', 'week', 'expenses', 'clients', 'daily', 'fellowship']] topicIndicesAndCoefficients = [ (11, 0.34463381927852504), (13, 0.28708132847232587), (18, 0.21475821305783774), (9, 0.20934126188765559), (5, 0.20686539023306405), (19, 0.15099384642581043), (4, 0.078019280568203023), (14, 0.061570138754807865), (17, 0.042947945859326486), (3, 0.023512600959147277), (15, 0.019221226724594188), (8, 0.0070316234981432591), (10, -0.026241133761063538) , (1, -0.071793214438988479) , (7, -0.078008030184428076), (6, -0.10954977745880268) , (0, -0.14755071028433084), (2, -0.1481791870952158) , (12, -0.16987378700739811) , (16, -0.17389029153368829)] Xtopics = ["topic_%d" % (topicIndicesAndCoefficients[i][0]) for i in range(len(topics))] topicLogresCoefficients = [topicIndicesAndCoefficients[i][1] for i in range(len(topics))] index = np.arange(len(topics)) bar_width = 0.70 plt.bar(index, topicLogresCoefficients, bar_width) plt.xlabel('Topic', fontsize="20") plt.ylabel('Impact on positive outcome (full funding) ', fontsize="20") plt.title('LogRes Model Coefficients for topical features', fontsize="20", y=1.03) plt.xticks(index + bar_width, Xtopics) for i in range(len(topics)): coeff = topicLogresCoefficients[i] string = "\n".join(topics[i]) if coeff >= 0: y = -0.13 else: y = 0.02 ax.annotate(string, xy=(index[i]+0.05, y), xytext=(index[i]+0.05, y), fontsize="10") plt.setp(plt.gca().get_yticklabels(), fontsize="20") plt.setp(plt.gca().get_xticklabels(), fontsize="10", ha="right", rotation=0) plt.autoscale(tight=True) plt.tight_layout() plt.show() %matplotlib inline import matplotlib.pyplot as plt import numpy as np fig = plt.figure(20, figsize=(12,8)) rankedFeaturesAndCoeffs = [ ('TotalAmountRaised', 0.51512673372977458), ('Log10NumberOfBorrowers', 0.48883803701379364), ('GeoLongitude', 0.38565914705597004), ('Rating', 0.077958867518446537), ('Log10EnglishDescriptionLength', 0.076418455975725913), ('DelinquencyRate', 0.0099085728485939755), ('GeoLatitude', -0.071888473425193139), ('BonusCreditEligibility', -0.16710379963027269), ('RepaymentTerm', -0.28978714414735179), ('LoansPosted', -0.53589567168998353), ('MajorityGender', -0.584374830570077), ('Log10LoanAmount', -1.4442547937444266)] features = ["%s" % (rankedFeaturesAndCoeffs[i][0]) for i in range(len(rankedFeaturesAndCoeffs))] print features logresCoeffs = [rankedFeaturesAndCoeffs[i][1] for i in range(len(rankedFeaturesAndCoeffs))] print logresCoeffs index = np.arange(len(rankedFeaturesAndCoeffs)) bar_width = 0.50 plt.bar(index, logresCoeffs, bar_width) plt.xlabel('Other features', fontsize="20") plt.ylabel('Impact on positive outcome\n(full funding) ', fontsize="20") plt.title('LogRes Model Coefficients for remaining features', fontsize="20", y=1.03) plt.xticks(index + bar_width, features) plt.setp(plt.gca().get_yticklabels(), fontsize="20") plt.setp(plt.gca().get_xticklabels(), fontsize="16", ha="right", rotation=45) plt.autoscale(tight=True) plt.tight_layout() plt.show()