import pandas as pd import numpy as np xl = pd.io.excel.read_excel('crunchbase_monthly_export_d43b44299ade53.xlsx', sheetname='Companies', index_col='permalink') xl.info() xl=xl[xl.founded_year >= 2000] xl.info() len(xl.market.value_counts()) xl.market.value_counts().head(20) from matplotlib import pyplot as plt %matplotlib inline from mpltools import style style.use('ggplot') xl.market.value_counts().head(30).plot(kind='bar', figsize=(15,8)) xl.country_code.value_counts().head(15) xl.country_code.value_counts().head(15).plot(kind='bar', figsize=(15,8)) xl.founded_year.value_counts(sort=False).tail(25).plot(kind='bar', figsize=(15,8)) xl.status.value_counts() xl.status.value_counts()* 100/len(xl) year_gp = xl.groupby('founded_year') type(year_gp) for year, group in year_gp: print year print group.info() break year_gp = xl[xl.founded_year>=2000].groupby('founded_year') for year, group in year_gp: print year, len(group) xl[xl.founded_year==2015] mask = (xl.founded_year>=2000) & (xl.founded_year<2015) year_gp = xl[(xl.founded_year>=2000) & (xl.founded_year<2015)].groupby('founded_year') xl.status.value_counts() type(xl.status.value_counts()) xl.status.value_counts().operating2 for year, group in year_gp: print year, len(group) print group.status.value_counts() *100/len(group) index, closed, operating, acquired, unknown = [],[],[],[],[] for year, group in year_gp: index.append(year) status = group.status.value_counts() *100/len(group) operating.append(status.operating) try: acquired.append(status.acquired) except AttributeError: acquired.append(0) try: closed.append(status.closed ) except AttributeError: closed.append(0) unknown.append(100 - operating[-1] -acquired[-1] -closed[-1]) closed, operating, acquired, unknown = np.array(closed), np.array(operating), np.array(acquired), np.array(unknown) width = 0.35 plt.figure(figsize=(15,5)) p1 = plt.bar(index, operating, width, color='g') p2 = plt.bar(index, acquired, width, bottom=operating, color='b') p3 = plt.bar(index, closed, width, bottom=acquired+operating, color='r') p4 = plt.bar(index, unknown, width, bottom=closed+acquired+operating, color='y') plt.ylabel('Success %') plt.title('Percentage of Startup Success by Year') #plt.xticks([int(i) for i in index]) #plt.yticks(xrange(0,101,10)) plt.legend( (p1[0], p2[0], p3[0], p4[0]), ('Operating', 'Acquired','Closed', 'Unknown'), bbox_to_anchor=(1.1, 1.05) ) xl.info() import datetime first_funding_at = pd.to_datetime(xl.first_funding_at, dayfirst=True, errors='ignore') print len(first_funding_at) first_funding_at = first_funding_at[[isinstance(value, datetime.datetime) for index, value in first_funding_at.iteritems() ]] print len(first_funding_at) founded_at = pd.to_datetime(xl.founded_at, dayfirst=True) founded_at = founded_at[[isinstance(value, datetime.datetime) for index, value in founded_at.iteritems()]] daysdiff = first_funding_at - founded_at daysdiff = daysdiff[pd.notnull(daysdiff)] dayslist=[value.days for index, value in daysdiff.iteritems()] plt.figure(figsize=(15,8)) graph = plt.hist(dayslist, bins=range(0, 6000, 365)) plt.figure(figsize=(15,8)) graph = plt.hist(dayslist, bins=[0, 30, 60, 90, 180, 365, 365+180, 2*365, 2*365 + 180, 3*365]) daysdiff[daysdiff<30] daysdiff[(daysdiff<30)&(daysdiff>0)] rounds = pd.io.excel.read_excel('crunchbase_monthly_export_d43b44299ade53.xlsx', sheetname='Rounds', index_col='funding_round_permalink') rounds.info() fund_type = rounds.funding_round_type.value_counts() fund_type fund_type.plot(kind='pie', figsize=(10,10)) recent_funding = rounds[rounds.funded_year >= 2000] funding_sum = recent_funding[['funded_quarter','raised_amount_usd']].groupby('funded_quarter').sum() funding_sum.plot(kind='bar', figsize=(15, 8)) q3_2006 = recent_funding[recent_funding.funded_quarter=='2006-Q3'].sort("raised_amount_usd", ascending=False) q3_2006[['company_name', 'company_market', 'raised_amount_usd']].head() venture_funding = rounds[(rounds.funded_year >= 2000) & (rounds.funding_round_type == 'venture')] venture_sum = venture_funding[['funded_quarter','raised_amount_usd']].groupby('funded_quarter').sum() venture_sum.plot(kind='bar', figsize=(15, 8)) seed_funding = rounds[(rounds.funded_year >= 2000) & ((rounds.funding_round_type == 'seed') | (rounds.funding_round_type=='angel'))] seed_sum = seed_funding[['funded_quarter','raised_amount_usd']].groupby('funded_quarter').sum() seed_sum.plot(kind='bar', figsize=(15, 8)) investments = pd.io.excel.read_excel('crunchbase_monthly_export_d43b44299ade53.xlsx', sheetname='Investments', index_col='funding_round_permalink') investments.info() invest_gp = investments[['investor_permalink', 'raised_amount_usd']]\ .groupby('investor_permalink').sum() type(invest_gp) invest_gp.sort('raised_amount_usd', ascending=False).head() investments[investments.investor_country_code == 'IND'] invest_ind = investments[investments.company_country_code=='IND']\ [['investor_permalink', 'raised_amount_usd']]\ .groupby('investor_permalink').sum() len(invest_ind) invest_us.sort('raised_amount_usd', ascending=False).head(10) cross_country_investing = investments[(investments.company_country_code != investments.investor_country_code) & pd.notnull(investments.company_country_code) & pd.notnull(investments.investor_country_code)] cross_country_investing = cross_country_investing[['investor_country_code', 'company_country_code', 'company_permalink']] cross_country_investing.info() invest_count = cross_country_investing.groupby(['investor_country_code','company_country_code']).count() invest_count.ix['IND'] invest_count.ix['USA'] invest_count.ix[('USA','IND')] investments[['raised_amount_usd','company_market']]\ .groupby('company_market')\ .sum()\ .sort('raised_amount_usd',ascending=False) investments[investments.company_country_code == 'IND']\ [['raised_amount_usd','company_market']]\ .groupby('company_market')\ .sum()\ .sort('raised_amount_usd',ascending=False).head(20) investments[(investments.company_country_code == 'IND') | (investments.funded_year >=2013)]\ [['raised_amount_usd','company_market']]\ .groupby('company_market')\ .sum()\ .sort('raised_amount_usd',ascending=False).head(20) investments.ix[1] acquisitions = pd.io.excel.read_excel('crunchbase_monthly_export_d43b44299ade53.xlsx', sheetname='Acquisitions', index_col='company_permalink') acquisitions.info() acquisitions = acquisitions[acquisitions.acquired_year>=2000] acquisitions[['acquirer_permalink','company_name']]\ .groupby("acquirer_permalink")\ .count()\ .sort('company_name',ascending=False) investments.info() yearly_invest = investments[investments.funded_year>=2000][['funded_year','company_market']].groupby('funded_year') index = [] from collections import defaultdict plotpoints = defaultdict(dict) for year, group in yearly_invest: index.append(year) rank = 1 for key, value in group.company_market.value_counts().head(10).iteritems(): plotpoints[key][year] = rank rank = rank + 1 with plt.style.context('fivethirtyeight'): plt.figure(figsize=(15,8)) plt.gca().invert_yaxis() for market,points in plotpoints.iteritems(): values =[points.get(year, 11) for year in index] plt.plot(index, values) #plt.plot(x, np.sin(x) + x + np.random.randn(50)) #plt.plot(x, np.sin(x) + 0.5 * x + np.random.randn(50)) #plt.plot(x, np.sin(x) + 2 * x + np.random.randn(50))