"""Compare open-access and closed-access journals on SNIP / SJR scores.

The script loads a SNIP/SJR table and a list of open-access journals,
normalises ISSNs, flags each journal in the impact table as open or closed by
ISSN membership, and then produces a series of summary tables and plots
(languages, keywords, start years, fees, disciplines, countries, and yearly
median SNIP/SJR by access status and country group).

NOTE(review): ``DataFrame.sort(...)`` and ``drop_duplicates(cols=...)`` are
legacy pandas spellings (newer releases use ``sort_values`` and ``subset=``).
They are kept as-is; confirm the target pandas version before upgrading.

NOTE(review): many statements are bare expressions (e.g. ``open_lang``,
``clean``) -- presumably notebook display cells; they have no effect when run
as a script. Several plots are also drawn without an intervening
``plt.figure()`` and therefore land on the current axes -- confirm intended.
"""
from __future__ import division, print_function

import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 10)

impact = pd.read_csv('../../SNIP_SJR_complete_1999_2011new_SNIP_and_SJR_v1_Oct_2012.csv')
open_access = pd.read_csv('../data/open_access_journals.csv')

# Matches @handles, any character that is not alphanumeric/space/tab, and
# URL-like tokens; every match is replaced by a space before spaces are removed.
_ISSN_NOISE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

# Year range covered by the per-year "<year> SNIP2" / "<year> SJR2" columns.
_YEARS = range(1999, 2012)


def rm_issn_punc(x):
    """Return *x* with punctuation and spaces removed (e.g. '1234-5678' -> '12345678').

    Non-string values (such as a NaN cell) are converted with ``str`` first
    instead of raising ``TypeError``; a NaN therefore becomes ``'nan'``, which
    the script later replaces with ``np.nan``.
    """
    try:
        spaced = _ISSN_NOISE.sub(" ", x)
    except TypeError:
        spaced = _ISSN_NOISE.sub(" ", str(x))
    return spaced.replace(" ", "")


def strip_space(x):
    """Return ``int(x)`` when possible, otherwise ``str(x)`` stripped of spaces."""
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Not numeric (e.g. an ISSN ending in 'X', or NaN): keep it as text.
        return str(x).strip(' ')


def membership(x):
    """Return True when *x* is NOT among the open-access ISSNs (i.e. "closed").

    Reads the module-level ``open_access`` frame at call time.
    NOTE(review): the array is rebuilt on every call; a set/``isin`` would be
    faster but treats NaN differently, so the original lookup is kept.
    """
    open_lst = np.array(open_access.issn)
    return not (x in open_lst)


# --- ISSN normalisation and open/closed classification -----------------------
open_access['issn_'] = open_access['ISSN'].map(rm_issn_punc)
open_access['issn'] = open_access['issn_'].map(strip_space)
impact['issn'] = impact['Print ISSN'].map(strip_space)
# strip_space turns missing values into the string 'nan'; restore real NaNs.
impact.issn = impact.issn.replace('nan', np.nan)
open_access.issn = open_access.issn.replace('nan', np.nan)
impact = impact.drop_duplicates(cols='issn')
open_access = open_access.drop_duplicates(cols='issn')
impact['closed'] = impact['issn'].map(membership)
closed = impact[impact.closed != False]
impact.closed.value_counts()

matches = pd.merge(impact, open_access,
                   left_on=impact['issn'], right_on=['issn']).drop_duplicates()
matches = matches.dropna(how='all')
matches = matches.drop_duplicates(cols='Title')
matches['Country'] = matches['Country_x']
matches[['Source Title', 'Title']].head(20)
print('Closed: {0}, Open: {1}, Full list: {2}'.format(
    len(closed.issn.dropna().unique()),
    len(open_access.issn.dropna().unique()),
    len(impact.issn.dropna().unique())))

# --- Top journals by 2011 SNIP / SJR -----------------------------------------
top_impact_all = impact[['Source Title', '2011 SNIP2']].copy()
top_impact_all = pd.DataFrame(
    top_impact_all.sort('2011 SNIP2', ascending=False).dropna(),
    columns=['Source Title', '2011 SNIP2'])
top_impact_all['2011 SJR2'] = impact['2011 SJR2']
top_impact_all['Difference'] = top_impact_all['2011 SNIP2'] - top_impact_all['2011 SJR2']
top_impact_all.head(15)
top_impact_all.sort('2011 SJR2', ascending=False).dropna().head(15)

# --- Descriptive plots for the open-access list -------------------------------
open_lang = open_access.Language.value_counts().head(10)/len(open_access)*100
open_lang
plt.figure()
open_lang.plot(kind='bar', title='Most common languages, open source journals (%)',
               color='green', alpha=.3)
plt.show()

open_lang = open_access.Keyword.value_counts().head(15)/len(open_access)*100
open_lang
open_lang.plot(kind='bar', title='Most common keywords, open source journals (%)',
               color='green', alpha=.3)
open_access.to_csv('open_access.csv')

plt.figure()
open_access['Start Year'].hist(range=(1980, 2012), bins=30, color='green', alpha=.3)
plt.title('Histogram of start year, open access journals')
timeline = open_access.sort('Start Year')
timeline[['Title', 'Start Year', 'End Year']].head(10)

fee = open_access['Publication fee'].value_counts()/len(open_access)*100
fee
fee.plot(kind='bar', title='Histogram of fee required (%)', color='green', alpha=.3, rot=0)
len(open_access) - len(matches)

# --- Discipline comparison ----------------------------------------------------
closed_field = closed[['Physical sciences', 'Life sciences', 'Social sciences']].count()/len(closed)
open_field = matches[['Physical sciences', 'Life sciences', 'Social sciences']].count()/len(matches)
closed_field.plot(color='red', kind='bar', alpha=.7)
tmp = pd.DataFrame(open_field, columns=['Open access'])
tmp['Closed access'] = closed_field
plt.figure()
tmp.plot(kind='bar', color=['green', 'red'], alpha=.5,
         title='Comparison of discipline', rot=0)

# --- Country comparison -------------------------------------------------------
countries = pd.DataFrame(closed.Country.value_counts(), columns=['closed'])
countries['open'] = open_access.Country.value_counts()
countries['proportion_oa'] = countries['open']/countries['closed']
countries_sorted = countries.sort('proportion_oa', ascending=False)
countries_sorted[countries_sorted.proportion_oa >= 0][:10]
plt.figure()
countries.closed.head(10).plot(color='red', kind='bar', alpha=.5)
countries.open.head(10).plot(
    color='green', kind='bar', alpha=.6,
    title='Comparison of top journal producers (%)\nGreen=Open access, red=Closed access',
    rot=30)
countries_sorted
countries_sorted.proportion_oa[countries_sorted.proportion_oa > 0].head(10).plot(
    kind='bar', color='g', rot=30, alpha=.5,
    title='Countries with highest proportion of OA journals (%)')

# --- 2011 score distributions -------------------------------------------------
snip_dist = pd.DataFrame(closed['2011 SNIP2'], columns=['2011 closed SNIP'])
snip_dist['2011 open SNIP'] = matches['2011 SNIP2']
snip_dist[snip_dist['2011 closed SNIP'] < 15].boxplot(sym='m+')
snip_dist.describe()

sjr_dist = pd.DataFrame(closed['2011 SJR2'], columns=['2011 closed SJR'])
sjr_dist['2011 open SJR'] = matches['2011 SJR2']
sjr_dist[sjr_dist['2011 closed SJR'] < 15].boxplot(sym='m+')
sjr_dist.describe()


def _split_by_era(scores, start_year):
    """Return (scores, before 1996, 1996-2001 inclusive, after 2001) by *start_year*."""
    era1 = scores[start_year < 1996]
    era2 = scores[(start_year >= 1996) & (start_year <= 2001)]
    era3 = scores[start_year > 2001]
    return scores, era1, era2, era3


def find_snip(db):
    """Return the '1999 SNIP2' ... '2011 SNIP2' columns of *db*."""
    return db[['{0} SNIP2'.format(year) for year in _YEARS]]


def select_snip_by_era(country_db):
    """Return yearly SNIP columns of *country_db*, whole and split by 'Start Year' era.

    The result is ``(all, start < 1996, 1996 <= start <= 2001, start > 2001)``.
    """
    return _split_by_era(find_snip(country_db), country_db['Start Year'])


def find_sjr(db):
    """Return the '1999 SJR2' ... '2011 SJR2' columns of *db*."""
    return db[['{0} SJR2'.format(year) for year in _YEARS]]


def select_sjr_by_era(country_db):
    """Return yearly SJR columns of *country_db*, whole and split by 'Start Year' era.

    The result is ``(all, start < 1996, 1996 <= start <= 2001, start > 2001)``.
    """
    return _split_by_era(find_sjr(country_db), country_db['Start Year'])


# --- Yearly median SNIP, open vs closed ---------------------------------------
open_years = find_snip(matches)
closed_years = find_snip(closed)
plt.figure()
open_years.median().plot(style='g')
closed_years.median().plot(
    style='--',
    title='Median SNIP score\nGreen = Open access\nRed = Closed access',
    rot=30, color='r')
clean = pd.DataFrame(open_years.median(), columns=['open'])
clean['closed'] = closed_years.median()
clean['median_difference'] = clean.closed - clean.open
plt.figure()
clean.median_difference.plot(title='Median difference (closed median - open median)',
                             rot=30, style='m')
clean
matches[['1999 SNIP2', '2011 SNIP2']].describe()
matches[['1999 SNIP2', '2011 SNIP2']].median()
matches['1999 SNIP2'].hist()

# --- Yearly median SJR, open vs closed ----------------------------------------
open_years_sjr = find_sjr(matches)
closed_years_sjr = find_sjr(closed)
open_years_sjr.median().plot(style='green')
# Title legend corrected: the open-access line above is the one drawn in green.
closed_years_sjr.median().plot(
    style='--',
    title='Median SJR score\nGreen=open journals\nBlue=closed journals',
    rot=30)
sjr_diff = closed_years_sjr.median() - open_years_sjr.median()
sjr_diff
sjr_diff.plot(title='Closed journal SJR advantage over open journal',
              rot=30, ylim=(.0, .75), style='m')
matches[['1999 SJR2', '2011 SJR2']].describe()

# --- "Big 4" countries vs the rest, OA journals by start-year era --------------
country_list = ['United States', 'United Kingdom', 'Germany', 'Netherlands']
# Everything after the four most frequent countries in the impact table.
# NOTE(review): presumes those top four are exactly `country_list` -- confirm.
nonlst = pd.DataFrame(impact.Country.value_counts()[4:])
nonlst = list(nonlst.index)
oa_big4 = matches[matches.Country_x.isin(country_list)]
oa_nonbig4 = matches[matches.Country_x.isin(nonlst)]
oa_snip, oa_era1, oa_era2, oa_era3 = select_snip_by_era(oa_big4)
non_snip, non_era1, non_era2, non_era3 = select_snip_by_era(oa_nonbig4)
oasjr, oasjr1, oasjr2, oasjr3 = select_sjr_by_era(oa_big4)
nonsjr, nonsjr1, nonsjr2, nonsjr3 = select_sjr_by_era(oa_nonbig4)

# NOTE(review): the first era is "Start Year < 1996" but is labelled '1996'.
median_sjr = pd.DataFrame([oasjr1['2011 SJR2'].median(),
                           oasjr2['2011 SJR2'].median(),
                           oasjr3['2011 SJR2'].median()])
median_sjr_col = median_sjr
tmp = median_sjr_col.rename(columns={0: 'Big4 Median SJR'}).T
median_sjr_big4 = tmp.rename(columns={0: '1996', 1: '1996-2001', 2: '2002-2011'})
OA_sjr_big4 = median_sjr_big4.T
# Label fixed from "mean" to "median": the values are medians.
OA_sjr_big4['Non-Big4 median SJR'] = ([nonsjr1['2011 SJR2'].median(),
                                       nonsjr2['2011 SJR2'].median(),
                                       nonsjr3['2011 SJR2'].median()])
OA_sjr_big4.T
OA_sjr_big4.T.plot(
    kind='bar', rot=0,
    title='2011 median SJR of OA journals from Big 4, by start year\n Big 4 = USA, UK, Netherlands & Germany')

median_snip = pd.DataFrame([oa_era1['2011 SNIP2'].median(),
                            oa_era2['2011 SNIP2'].median(),
                            oa_era3['2011 SNIP2'].median()])
median_snip_col = median_snip
# Label fixed from "Mean" to "median": the values are medians.
tmp = median_snip_col.rename(columns={0: 'Big4 median SNIP'}).T
median_snip_big4 = tmp.rename(columns={0: '1996', 1: '1996-2001', 2: '2002-2011'})
OA_big4 = median_snip_big4.T
OA_big4['Non-Big4 median SNIP'] = ([non_era1['2011 SNIP2'].median(),
                                    non_era2['2011 SNIP2'].median(),
                                    non_era3['2011 SNIP2'].median()])
OA_big4.T
OA_big4.T.plot(
    kind='bar', rot=0,
    title='Median SNIP of OA journals from Big 4, by start year\n Big 4 = USA, UK, Netherlands & Germany')

# --- Yearly medians by access status and country group ------------------------
big4_closed = closed[closed.Country.isin(country_list)]
non_closed = closed[closed.Country.isin(nonlst)]
big4_closed_snip = find_snip(big4_closed)
non_closed_snip = find_snip(non_closed)
big4_closed_sjr = find_sjr(big4_closed)
non_closed_sjr = find_sjr(non_closed)

plt.figure()
sjr_combo = pd.DataFrame([oasjr.median(), nonsjr.median(),
                          big4_closed_sjr.median(), non_closed_sjr.median()]).T
sjr_combo = sjr_combo.rename(columns={0: 'Big4 OA', 1: 'Non-Big4 OA',
                                      2: 'Big4 closed', 3: 'Non-Big4 closed'})
sjr_combo.plot(rot=30)
plt.show()
# NOTE(review): 'OA diff' is the Big4 closed-minus-OA gap and 'closed diff' the
# Non-Big4 gap; the column names do not say so.
sjr_combo['OA diff'] = sjr_combo['Big4 closed'] - sjr_combo['Big4 OA']
sjr_combo['closed diff'] = sjr_combo['Non-Big4 closed'] - sjr_combo['Non-Big4 OA']
sjr_combo[['OA diff', 'closed diff']].plot(rot=30, title='SJR difference, OA and closed')
sjr_combo.describe()

kind_combo = pd.DataFrame([oa_snip.median(), non_snip.median(),
                           big4_closed_snip.median(), non_closed_snip.median()]).T
kind_combo = kind_combo.rename(columns={0: 'Big4 OA', 1: 'non-Big4 OA',
                                        2: 'Big4 closed', 3: 'Non-Big4 closed'})
kind_combo.describe()
kind_combo.to_csv("snip_origin_year.csv")
kind_combo.plot(rot=30, title='Median SNIP by year, access status and country of origin')
kind_combo['OA diff'] = kind_combo['Big4 closed'] - kind_combo['Big4 OA']
kind_combo['closed diff'] = kind_combo['Non-Big4 closed'] - kind_combo['non-Big4 OA']
kind_combo[['OA diff', 'closed diff']].plot(rot=30, title='SNIP difference, OA and closed')


def country_impact(df):
    """Return per-country median 2011 SNIP/SJR for *df*, and plot/print a summary.

    The returned frame is indexed by 'Country' with columns 'Median snip',
    'Num_journals_total' and 'Median SJR'. 'Num_journals_total' is taken from
    the module-level ``impact`` table, not from *df*.

    Side effects: opens a new figure with two scatter plots (medians against
    the log of the journal count) and prints r and p from ``stats.linregress``
    for each score against the journal count.
    """
    impacts = df.groupby(by='Country')
    pop_countries = pd.DataFrame(impact.Country.value_counts(),
                                 columns=['Num_journals_total'])
    country_median = pd.DataFrame(impacts['2011 SNIP2'].median(),
                                  columns=['Median snip'])
    country_median['Num_journals_total'] = pop_countries.Num_journals_total
    country_median['Median SJR'] = impacts['2011 SJR2'].median()

    plt.figure()
    # np.log: the bare name `log` was never imported in this file.
    log_count = np.log(country_median['Num_journals_total'])
    plt.scatter(x=log_count, y=country_median['Median snip'])
    plt.scatter(x=log_count, y=country_median['Median SJR'], c='y', marker='+')
    plt.title('Median impact by log of number of journals published in that country\nBlue is SNIP, yellow is SJR')

    # linregress cannot handle NaNs, so drop incomplete rows first.
    clean = country_median.dropna(how='any')
    _, _, r_value, p_value, _ = stats.linregress(clean['Median snip'],
                                                 clean['Num_journals_total'])
    print('SNIP. r = {0}, p = {1}'.format(r_value, p_value))
    _, _, r_value1, p_value1, _ = stats.linregress(clean['Median SJR'],
                                                   clean['Num_journals_total'])
    print('SJR. r = {0}, p = {1}'.format(r_value1, p_value1))
    return country_median


# --- Per-country open vs closed medians ---------------------------------------
# NOTE: from here on `open_access` is the per-country summary frame, no longer
# the raw journal list, so `membership` must not be called after this point.
open_access = country_impact(matches)
closed_access = country_impact(closed)
combo = pd.merge(open_access, closed_access, suffixes=('_open', '_closed'),
                 left_on=open_access.index, right_on=closed_access.index)
combo = combo.set_index('key_0')
combo['snip_diff'] = combo['Median snip_open'] - combo['Median snip_closed']
combo['sjr_diff'] = combo['Median SJR_open'] - combo['Median SJR_closed']
for_plot = combo[combo['Num_journals_total_open'] >= 150]
for_plot.snip_diff.plot(
    kind='bar',
    title='Median snip open - median snip closed\n(Negative means closed has higher snip)')
for_plot.sjr_diff.plot(kind='bar', title='SJR')
combo.snip_diff.hist(bins=30)
combo.snip_diff.describe()
combo.sjr_diff.hist(bins=30)
combo.sjr_diff.describe()
combo.head()

clean = combo.dropna(how='any')
slope1, intercept1, r_value1, p_value1, std_err1 = stats.linregress(
    clean['snip_diff'], clean['Num_journals_total_open'])
print(r_value1, p_value1)
plt.figure()
plt.scatter(np.log(clean['Num_journals_total_open']), clean['snip_diff'])