import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import d3py

# Journal-level table; the columns used below are 'Source Title', 'Country',
# '2011 SNIP2' and '2011 SJR2'.
impact = pd.read_csv('SNIP_SJR_complete_1999_2011new_SNIP_and_SJR_v1_Oct_2012.csv')


def count_words(title):
    """Return the number of space-separated tokens in *title*.

    The split is on a single space character, so consecutive spaces yield
    empty tokens that are counted as well (``'a  b'`` counts as 3).
    """
    return len(title.split(' '))


def impact_regression(df):
    """Regress '2011 SNIP2' on 'title_wordcnt' and scatter-plot the pair.

    Rows with a missing value in 'Source Title', '2011 SNIP2' or
    'title_wordcnt' are dropped first.  The r-value and p-value of the
    linear fit are printed; nothing is returned.  The scatter plot is drawn
    on the current matplotlib axes.
    """
    clean = df[['Source Title', '2011 SNIP2', 'title_wordcnt']].dropna(how='any')
    _slope, _intercept, r_value, p_value, _std_err = stats.linregress(
        clean['title_wordcnt'], clean['2011 SNIP2'])
    # Single-argument parenthesised form prints the same text on Python 2 and 3.
    print('Rvalue: {0}\nPvalue: {1}\n\n'.format(r_value, p_value))
    plt.scatter(clean['title_wordcnt'], clean['2011 SNIP2'])


# Derive the title length (in words) for every journal and persist the result.
impact['title_wordcnt'] = impact['Source Title'].map(count_words)
impact.to_csv('impact_titlelength.csv')

# Distribution of 2011 SJR values, clipped to [0, 5] before binning.
impact['2011 SJR2'].clip(0, 5).hist(bins=50, color='MediumAquamarine')
plt.title('Histogram of 2011 SJR scores')
plt.xlabel('SJR score')
plt.ylabel('Counts')

# Distribution of title lengths, clipped to [1, 10] before binning.
impact.title_wordcnt.clip(1, 10).hist(bins=10, color='m', alpha=.4)
plt.title('Title length')
plt.xlabel('Number of words in title')
plt.ylabel('Count')

# Regression over all journals, then for United States journals only.
impact_regression(impact)

usa = impact[impact.Country == 'United States']
impact_regression(usa)

# NOTE: bare expressions such as the one below are only echoed in an
# interactive session; they have no visible effect when run as a script.
impact.Country.value_counts()[:10]

# Restrict to four selected countries.
country_list = ['United States', 'United Kingdom', 'Netherlands', 'Germany']
big4 = impact[impact.Country.isin(country_list)]
impact_regression(big4)

# These inspections need `big4`, so they must come after it is created
# (originally they ran before the assignment and raised NameError).
big4.title_wordcnt.describe()
big4['2011 SNIP2'].describe()
big4[big4['2011 SNIP2'] >= 30][['2011 SNIP2', 'Source Title']]

# Per-country regressions for the remaining three countries.
uk = impact[impact.Country == 'United Kingdom']
impact_regression(uk)

neth = impact[impact.Country == 'Netherlands']
impact_regression(neth)

germ = impact[impact.Country == 'Germany']
impact_regression(germ)

# Slim per-country table: export it and summarise by country.
simplified = big4[['Country', 'title_wordcnt', '2011 SNIP2']]
# Group by the frame's own Country column (the original referenced an
# undefined name `tmp` here).
simpl = simplified.groupby(by=simplified.Country)
simplified.to_csv('big4_titlelength.csv')
big4
simpl.describe()