import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import d3py
# Load the SNIP/SJR CSV into the `impact` frame that the rest of the file works on.
impact = pd.read_csv('SNIP_SJR_complete_1999_2011new_SNIP_and_SJR_v1_Oct_2012.csv')
def count_words(title):
    """Return the number of whitespace-separated words in ``title``.

    The title is split on runs of whitespace rather than on single spaces,
    so repeated, leading or trailing spaces do not add empty "words" to the
    count. An empty or all-whitespace title yields 0.

    Args:
        title: The title string to measure.

    Returns:
        The word count as an int.
    """
    return len(title.split())
def impact_regression(df):
    """Report and plot how title word count relates to the 2011 SNIP score.

    Rows missing the title, the 2011 SNIP value or the word count are
    dropped. A simple linear regression of '2011 SNIP2' on 'title_wordcnt'
    is then fitted, its correlation coefficient and p-value are printed, and
    the same points are drawn as a scatter plot via ``plt.scatter``.

    Args:
        df: DataFrame containing the 'Source Title', '2011 SNIP2' and
            'title_wordcnt' columns.

    Returns:
        None. The results are printed and plotted, not returned.
    """
    clean = df[['Source Title', '2011 SNIP2', 'title_wordcnt']].dropna(how='any')
    word_counts = clean['title_wordcnt']
    snip_scores = clean['2011 SNIP2']
    # Only the correlation and its significance are reported; slope,
    # intercept and standard error are discarded.
    _, _, r_value, p_value, _ = stats.linregress(word_counts, snip_scores)
    # A parenthesised single-argument print works under both Python 2 and 3.
    print('Rvalue: {0}\nPvalue: {1}\n\n'.format(r_value, p_value))
    plt.scatter(word_counts, snip_scores)
# Derive a word count for every 'Source Title' and store it as a new column.
impact['title_wordcnt'] = impact['Source Title'].map(count_words)
# Save the augmented table so the word counts can be reused outside this session.
impact.to_csv('impact_titlelength.csv')
# Summary statistics of the title word counts in the big4 subset.
# NOTE(review): big4 is only assigned further down in this file, so this line
# depends on notebook execution order — confirm before running top to bottom.
big4.title_wordcnt.describe()
count 20698.000000 mean 4.280075 std 2.504240 min 1.000000 25% 3.000000 50% 4.000000 75% 5.000000 max 31.000000
# Summary statistics of the 2011 SNIP scores in the big4 subset
# (big4 itself is assigned later in this file).
big4['2011 SNIP2'].describe()
count 12581.000000 mean 0.992255 std 1.067147 min 0.000000 25% 0.427000 50% 0.842000 75% 1.271000 max 41.082000
# List the extreme rows: journals in big4 whose 2011 SNIP is at least 30,
# showing only the score and the title.
big4[big4['2011 SNIP2'] >=30][['2011 SNIP2', 'Source Title']]
| | 2011 SNIP2 | Source Title |
|---|---|---|
| 4874 | 41.082 | CA - A Cancer Journal for Clinicians |
| 10365 | 32.028 | Foundations and Trends in Information Retrieval |
# Histogram of the 2011 SJR scores. Values are clipped to [0, 5] first, so
# anything above 5 lands in the top bin instead of stretching the x-axis.
sjr_2011 = impact['2011 SJR2']
sjr_2011.clip(0, 5).hist(bins=50, color='MediumAquamarine')
plt.title('Histogram of 2011 SJR scores')
plt.xlabel('SJR score')
# The trailing semicolon suppresses the echoed return value in a notebook cell.
plt.ylabel('Counts');
# Histogram of title lengths. Word counts are clipped to [1, 10] first, so
# every title longer than ten words is counted in the last bin.
title_lengths = impact['title_wordcnt'].clip(1, 10)
title_lengths.hist(bins=10, color='m', alpha=.4)
plt.title('Title length')
plt.xlabel('Number of words in title')
# The trailing semicolon suppresses the echoed return value in a notebook cell.
plt.ylabel('Count');
# Regress 2011 SNIP on title word count over every journal in the table.
impact_regression(impact)
Rvalue: -0.0196096947592 Pvalue: 0.00688449443127
# Repeat the regression for journals whose Country is 'United States' only.
usa = impact[impact.Country == 'United States']
impact_regression(usa)
Rvalue: 0.000655392936585 Pvalue: 0.961306145523
# Number of journals per country, keeping the first ten entries of the tally.
impact.Country.value_counts()[:10]
United States 9731 United Kingdom 5961 Netherlands 2617 Germany 2389 France 966 China 758 Japan 722 Italy 703 Switzerland 618 Spain 511
# Restrict the table to four selected countries and rerun the regression on
# that subset; big4 is also used by other statements in this file.
country_list = ['United States', 'United Kingdom', 'Netherlands', 'Germany']
big4 = impact[impact.Country.isin(country_list)]
impact_regression(big4)
Rvalue: -0.014060656878 Pvalue: 0.114787327644
# Repeat the regression for journals whose Country is 'United Kingdom' only.
uk = impact[impact.Country == 'United Kingdom']
impact_regression(uk)
Rvalue: -0.0751363159685 Pvalue: 2.11846434762e-06
# Repeat the regression for journals whose Country is 'Netherlands' only.
neth = impact[impact.Country == 'Netherlands']
impact_regression(neth)
Rvalue: -0.0110826798416 Pvalue: 0.639559349463
# Repeat the regression for journals whose Country is 'Germany' only.
germ = impact[impact.Country == 'Germany']
impact_regression(germ)
Rvalue: 0.00935943389908 Pvalue: 0.732414716492
# Keep only the columns needed for the per-country comparison.
simplified = big4[['Country', 'title_wordcnt', '2011 SNIP2']]
# Group on the frame's own Country column. The previous key, `tmp.Country`,
# used a name that is not assigned anywhere in the visible code.
simpl = simplified.groupby('Country')
# Save the reduced table for use outside this session.
simplified.to_csv('big4_titlelength.csv')
# Bare expression: in a notebook cell this echoes a summary of the big4 frame.
big4
<class 'pandas.core.frame.DataFrame'> Int64Index: 20698 entries, 2 to 32053 Data columns: Source Title 20698 non-null values Print ISSN 20162 non-null values E-ISSN 5574 non-null values Publisher's Name 20606 non-null values Publisher imprints grouped to main Publisher 20605 non-null values Country 20698 non-null values 1999 SNIP2 7738 non-null values 1999 SJR2 12805 non-null values 2000 SNIP2 7939 non-null values 2000 SJR2 12805 non-null values 2001 SNIP2 8575 non-null values 2001 SJR2 12805 non-null values 2002 SNIP2 8929 non-null values 2002 SJR2 12805 non-null values 2003 SNIP2 9533 non-null values 2003 SJR2 12805 non-null values 2004 SNIP2 9792 non-null values 2004 SJR2 12805 non-null values 2005 SNIP2 9993 non-null values 2005 SJR2 12805 non-null values 2006 SNIP2 10598 non-null values 2006 SJR2 12805 non-null values 2007 SNIP2 11079 non-null values 2007 SJR2 12805 non-null values 2008 SNIP2 11435 non-null values 2008 SJR2 12805 non-null values 2009 SNIP2 11772 non-null values 2009 SJR2 12805 non-null values 2010 SNIP2 12267 non-null values 2010 SJR2 12805 non-null values 2011 SNIP2 12581 non-null values 2011 SJR2 12805 non-null values Life sciences 3616 non-null values Social sciences 5873 non-null values Physical sciences 7219 non-null values Top level: Health Sciences 7439 non-null values 1000 General 36 non-null values 1100 Agricultural and Biological Sciences 1179 non-null values 1200 Arts and Humanities 1614 non-null values 1300 Biochemistry, Genetics and Molecular Biology 1625 non-null values 1400 Business, Management and Accounting 1013 non-null values 1500 Chemical Engineering 564 non-null values 1600 Chemistry 677 non-null values 1700 Computer Science 1153 non-null values 1800 Decision Sciences 233 non-null values 1900 Earth and Planetary Sciences 994 non-null values 2000 Economics, Econometrics and Finance 637 non-null values 2100 Energy 300 non-null values 2200 Engineering 2931 non-null values 2300 Environmental Science 1190 non-null values 2400 
Immunology and Microbiology 462 non-null values 2500 Materials Science 942 non-null values 2600 Mathematics 861 non-null values 2700 Medicine 6941 non-null values 2800 Neuroscience 405 non-null values 2900 Nursing 483 non-null values 3000 Pharmacology, Toxicology and Pharmaceutics 597 non-null values 3100 Physics and Astronomy 805 non-null values 3200 Psychology 873 non-null values 3300 Social Sciences 3457 non-null values 3400 Veterinary 124 non-null values 3500 Dentistry 104 non-null values 3600 Health Professions 315 non-null values All Science Classification Codes (ASJC) 20500 non-null values Sourcerecord id 20698 non-null values title_wordcnt 20698 non-null values dtypes: float64(26), int64(2), object(38)
# Summary statistics computed separately for each group in `simpl`.
simpl.describe()
| Country | | title_wordcnt | 2011 SNIP2 |
|---|---|---|---|
| Germany | count | 2389.000000 | 1337.000000 |
| | mean | 3.860611 | 0.692680 |
| | std | 2.423934 | 0.617181 |
| | min | 1.000000 | 0.000000 |
| | 25% | 2.000000 | 0.238000 |
| | 50% | 3.000000 | 0.608000 |
| | 75% | 5.000000 | 1.018000 |
| | max | 23.000000 | 6.801000 |
| Netherlands | count | 2617.000000 | 1788.000000 |
| | mean | 3.993122 | 1.118182 |
| | std | 2.167832 | 0.868283 |
| | min | 1.000000 | 0.000000 |
| | 25% | 2.000000 | 0.597000 |
| | 50% | 4.000000 | 1.019000 |
| | 75% | 5.000000 | 1.451500 |
| | max | 21.000000 | 11.363000 |
| United Kingdom | count | 5961.000000 | 3974.000000 |
| | mean | 4.160208 | 0.987923 |
| | std | 2.282337 | 0.848802 |
| | min | 1.000000 | 0.000000 |
| | 25% | 2.000000 | 0.502000 |
| | 50% | 4.000000 | 0.876000 |
| | 75% | 5.000000 | 1.255000 |
| | max | 28.000000 | 13.535000 |
| United States | count | 9731.000000 | 5482.000000 |
| | mean | 4.533655 | 1.027386 |
| | std | 2.702778 | 1.313232 |
| | min | 1.000000 | 0.000000 |
| | 25% | 3.000000 | 0.401000 |
| | 50% | 4.000000 | 0.813500 |
| | 75% | 6.000000 | 1.286000 |
| | max | 31.000000 | 41.082000 |