from __future__ import division

import pandas as pd
#pd.set_printoptions(max_rows=100, max_columns=10)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 10)
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Load the two input tables: journal impact metrics (the SNIP/SJR columns used
# below) and the list of open access journals.  Both paths are relative to the
# directory the script is run from.
impact = pd.read_csv('../../SNIP_SJR_complete_1999_2011new_SNIP_and_SJR_v1_Oct_2012.csv')
open_access = pd.read_csv('../data/open_access_journals.csv')

def rm_issn_punc(x):
    """Strip punctuation and spaces from an ISSN-like string.

    ``@word`` tokens, URL-like ``scheme://...`` runs and every character that
    is not an ASCII letter, digit, space or tab are blanked out; all space
    characters are then removed.  Tab characters are left in place.

    :param x: raw string, e.g. ``'1234-5678'``.
    :returns: the cleaned string, e.g. ``'12345678'``.
    :raises TypeError: if ``x`` is not a string (e.g. a float NaN).
    """
    import re
    # Raw string: the pattern contains regex escapes (\w, \/, \S) that are not
    # valid *string* escapes and produce warnings on newer Python versions.
    blanked = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", x)
    return blanked.replace(" ", "")


def strip_space(x):
    """Return ``x`` as an int when it converts cleanly, else as a trimmed string.

    :param x: any value; typically an ISSN read from a CSV column.
    :returns: ``int(x)`` if that succeeds, otherwise ``str(x)`` with leading
        and trailing space characters removed.  Missing values (float NaN)
        therefore come back as the text ``'nan'``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only the errors int() raises for unconvertible input are expected
        # here (bad text, None, NaN, infinity); anything else should surface.
        return str(x).strip(' ')


def membership(x):
    """Tell whether ``x`` is missing from the open access ISSN column.

    The module-level ``open_access`` frame is read at call time, so the answer
    reflects whatever ``open_access.issn`` holds when the function runs.

    :param x: an ISSN value to look up.
    :returns: ``False`` if ``x`` occurs in ``open_access.issn``, else ``True``.
    """
    known_open = np.array(open_access.issn)
    return not (x in known_open)


# Build a normalised 'issn' key on both tables so journals can be matched.
# Open access ISSNs: strip punctuation/spaces first, then coerce to int where
# the result converts cleanly.
open_access['issn_'] = open_access['ISSN'].map(rm_issn_punc)
open_access['issn'] = open_access['issn_'].map(strip_space)

# NOTE(review): the impact table's 'Print ISSN' only goes through strip_space,
# not rm_issn_punc -- presumably that column carries no punctuation; confirm.
impact['issn'] = impact['Print ISSN'].map(strip_space)

# strip_space stringifies whatever it cannot convert, so missing ISSNs come
# back as the text 'nan'; turn those back into real NaN values.
impact.issn = impact.issn.replace('nan', np.nan)
open_access.issn = open_access.issn.replace('nan', np.nan)

# Keep a single row per ISSN in each table.
# NOTE(review): `cols=` is the keyword of older pandas releases (later renamed
# `subset`); this script depends on that older API.
impact = impact.drop_duplicates(cols='issn')
open_access = open_access.drop_duplicates(cols='issn')

# 'closed' is True when the journal's ISSN is absent from open_access.issn.
impact['closed'] = impact['issn'].map(membership)
closed = impact[impact.closed != False]

# Bare expression: only produces output in an interactive session.
impact.closed.value_counts()

# Join impact rows to open access rows on the normalised ISSN.
# NOTE(review): the left key is passed as a Series and the right key as a
# column label; the naming of the key column(s) in the result may depend on
# the pandas version -- verify before changing this call.
matches = pd.merge(impact, open_access, left_on=impact['issn'], right_on=['issn']).drop_duplicates()
# Drop rows in which every column is NaN, then keep one row per 'Title'.
matches = matches.dropna(how='all')
matches = matches.drop_duplicates(cols='Title')
# Both inputs have a 'Country' column; '_x' is pandas' default suffix for the
# left (impact) frame, re-exposed here under the plain name.
matches['Country'] = matches['Country_x']
matches[['Source Title', 'Title']].head(20)

# Report the number of distinct, non-missing ISSNs in each set.
print 'Closed: {0}, Open: {1}, Full list: {2}'.format(len(closed.issn.dropna().unique()), len(open_access.issn.dropna().unique()), len(impact.issn.dropna().unique()))

# Journals ranked by 2011 SNIP, highest first; rows with a missing title or
# SNIP value are dropped.
top_impact_all = impact[['Source Title', '2011 SNIP2']].copy()
top_impact_all = pd.DataFrame(top_impact_all.sort('2011 SNIP2', ascending=False).dropna(), columns=['Source Title', '2011 SNIP2'])
# Column assignment aligns on the row index, so each journal receives its own
# 2011 SJR value even though the rows were re-ordered above.
top_impact_all['2011 SJR2'] = impact['2011 SJR2']
top_impact_all['Difference'] = top_impact_all['2011 SNIP2'] - top_impact_all['2011 SJR2']

# Bare expressions below only produce output in an interactive session.
top_impact_all.head(15)

# The same table re-ranked by 2011 SJR (rows with any missing value dropped).
top_impact_all.sort('2011 SJR2', ascending=False).dropna().head(15)

# Share (%) of all open access journals for each of the ten most common
# languages.
open_lang = open_access.Language.value_counts().head(10)/len(open_access)*100
open_lang

plt.figure()
open_lang.plot(kind='bar', title='Most common languages, open access journals (%)', color='green', alpha=.3);
plt.show()

# Same computation for the fifteen most common keywords.  Note that the name
# `open_lang` is reused and now holds keyword shares.
open_lang = open_access.Keyword.value_counts().head(15)/len(open_access)*100
open_lang

# Series.plot draws on the current axes, so open a fresh figure for every
# chart that is meant to stand alone.
plt.figure()
open_lang.plot(kind='bar', title='Most common keywords, open access journals (%)', color='green', alpha=.3);

# Persist the cleaned open access table (written to the working directory).
open_access.to_csv('open_access.csv')

plt.figure()
open_access['Start Year'].hist(range=(1980, 2012), bins=30, color='green', alpha=.3)
plt.title('Histogram of start year, open access journals');

# Earliest-starting journals first.
timeline = open_access.sort('Start Year')
timeline[['Title', 'Start Year', 'End Year']].head(10)

# Share (%) of open access journals per 'Publication fee' value.
fee = open_access['Publication fee'].value_counts()/len(open_access)*100
fee

# Separate figure; otherwise the bars land on top of the start-year histogram.
plt.figure()
fee.plot(kind='bar', title='Histogram of fee required (%)', color='green', alpha=.3, rot=0);

# Open access journals that found no match in the impact table.
len(open_access) - len(matches)

# Fraction of journals with a non-missing entry in each discipline column
# (DataFrame.count() counts non-null cells).
closed_field = closed[['Physical sciences', 'Life sciences', 'Social sciences']].count()/len(closed)
open_field = matches[['Physical sciences', 'Life sciences', 'Social sciences']].count()/len(matches)

# Series.plot and DataFrame.boxplot draw on the current axes, so every
# standalone chart in this section gets its own figure.
plt.figure()
closed_field.plot(color='red', kind='bar', alpha=.7);

tmp = pd.DataFrame(open_field, columns=['Open access'])
tmp['Closed access'] = closed_field

plt.figure()
tmp.plot(kind='bar', color=['green', 'red'], alpha=.5, title='Comparison of discipline', rot=0);

# Journal counts per country: closed journals from the impact table, open
# journals from the full open access list.
countries = pd.DataFrame(closed.Country.value_counts(), columns=['closed'])
countries['open'] = open_access.Country.value_counts()
# NOTE(review): this is the ratio open/closed, not open/(open+closed), and the
# chart titles below say '(%)' although the plotted values are raw counts or
# ratios -- confirm the intended quantity.
countries['proportion_oa'] = countries['open']/countries['closed']

countries_sorted = countries.sort('proportion_oa', ascending=False)
# The comparison `>= 0` filters out rows whose ratio is NaN.
countries_sorted[countries_sorted.proportion_oa >= 0][:10]

# The two bar series are intentionally overlaid on one set of axes.
plt.figure()
countries.closed.head(10).plot(color='red', kind='bar', alpha=.5)
countries.open.head(10).plot(color='green', kind='bar', alpha=.6, title='Comparison of top journal producers (%)\nGreen=Open access, red=Closed access', rot=30);

countries_sorted

plt.figure()
countries_sorted.proportion_oa[countries_sorted.proportion_oa > 0].head(10).plot(kind='bar', color ='g', rot=30, alpha=.5,
title ='Countries with highest proportion of OA journals (%)');

# Distribution of 2011 SNIP: closed journals vs matched open access journals.
# The open column is aligned to the closed journals' row index on assignment.
snip_dist = pd.DataFrame(closed['2011 SNIP2'], columns=['2011 closed SNIP'])
snip_dist['2011 open SNIP'] = matches['2011 SNIP2']

# Rows with a closed SNIP of 15 or more are left out of the boxplot.
plt.figure()
snip_dist[snip_dist['2011 closed SNIP'] <15].boxplot(sym='m+');

snip_dist.describe()

# Same comparison for 2011 SJR.
sjr_dist = pd.DataFrame(closed['2011 SJR2'], columns=['2011 closed SJR'])
sjr_dist['2011 open SJR'] = matches['2011 SJR2']

plt.figure()
sjr_dist[sjr_dist['2011 closed SJR']<15].boxplot(sym='m+');

sjr_dist.describe()

def find_snip(db):
    """Return the thirteen yearly SNIP columns (1999 through 2011) of ``db``.

    :param db: DataFrame containing columns named ``'<year> SNIP2'``.
    :returns: DataFrame restricted to those columns, in chronological order.
    """
    snip_columns = ['{0} SNIP2'.format(year) for year in range(1999, 2012)]
    return db[snip_columns]


def select_snip_by_era(country_db):
    """Split the yearly SNIP columns of ``country_db`` by journal start year.

    :param country_db: DataFrame with a ``'Start Year'`` column plus the
        yearly SNIP columns selected by ``find_snip``.
    :returns: tuple ``(all, era1, era2, era3)`` of SNIP frames, where era1 has
        start years before 1996, era2 from 1996 to 2001 inclusive, and era3
        after 2001.
    """
    start_year = country_db['Start Year']
    snip = find_snip(country_db)

    before_1996 = start_year < 1996
    from_1996_to_2001 = (start_year >= 1996) & (start_year <= 2001)
    after_2001 = start_year > 2001

    return snip, snip[before_1996], snip[from_1996_to_2001], snip[after_2001]


def find_sjr(db):
    """Return the thirteen yearly SJR columns (1999 through 2011) of ``db``.

    :param db: DataFrame containing columns named ``'<year> SJR2'``.
    :returns: DataFrame restricted to those columns, in chronological order.
    """
    sjr_columns = ['{0} SJR2'.format(year) for year in range(1999, 2012)]
    return db[sjr_columns]


def select_sjr_by_era(country_db):
    """Split the yearly SJR columns of ``country_db`` by journal start year.

    :param country_db: DataFrame with a ``'Start Year'`` column plus the
        yearly SJR columns selected by ``find_sjr``.
    :returns: tuple ``(all, era1, era2, era3)`` of SJR frames, where era1 has
        start years before 1996, era2 from 1996 to 2001 inclusive, and era3
        after 2001.
    """
    start_year = country_db['Start Year']
    sjr = find_sjr(country_db)

    before_1996 = start_year < 1996
    from_1996_to_2001 = (start_year >= 1996) & (start_year <= 2001)
    after_2001 = start_year > 2001

    return sjr, sjr[before_1996], sjr[from_1996_to_2001], sjr[after_2001]

# Yearly SNIP columns for matched open access journals and for closed journals.
open_years = find_snip(matches)
closed_years = find_snip(closed)

# Both median lines are intentionally drawn on the same axes.
plt.figure()
open_years.median().plot(style='g');
closed_years.median().plot(style='--', title='Median SNIP score\nGreen = Open access\nRed = Closed access', rot=30, color='r');

# Per-year medians side by side, plus the closed-minus-open gap.
clean = pd.DataFrame(open_years.median(), columns=['open'])
clean['closed'] = closed_years.median()
clean['median_difference'] = clean.closed - clean.open

plt.figure()
clean.median_difference.plot(title='Median difference (closed median - open median)', rot=30, style='m');

# Bare expressions only produce output in an interactive session.
clean

matches[['1999 SNIP2', '2011 SNIP2']].describe()

matches[['1999 SNIP2', '2011 SNIP2']].median()

# Series.hist / Series.plot draw on the current axes; start a new figure so
# the histogram does not land on the median-difference chart.
plt.figure()
matches['1999 SNIP2'].hist()

open_years_sjr = find_sjr(matches)
closed_years_sjr = find_sjr(closed)

# Open journals are the solid green line; closed journals are the dashed line
# in the default colour.  The title legend previously had the two swapped.
plt.figure()
open_years_sjr.median().plot(style='green');
closed_years_sjr.median().plot(style='--', title='Median SJR score\nGreen=open journals\nBlue=closed journals', rot=30);

sjr_diff = closed_years_sjr.median() - open_years_sjr.median()
sjr_diff

plt.figure()
sjr_diff.plot(title='Closed journal SJR advantage over open journal', rot=30, ylim=(.0, .75), style='m');

matches[['1999 SJR2', '2011 SJR2']].describe()

# The "Big 4" publishing countries versus every other country that appears in
# the impact table.  The complement is built by excluding the Big 4 by name
# rather than by assuming they are the four most frequent countries.
country_list = ['United States', 'United Kingdom', 'Germany','Netherlands']
nonlst = [country for country in impact.Country.value_counts().index if country not in country_list]

oa_big4 = matches[matches.Country_x.isin(country_list)]
oa_nonbig4 = matches[matches.Country_x.isin(nonlst)]

# Yearly scores for each group, overall and split by journal start-year era.
oa_snip, oa_era1, oa_era2, oa_era3 = select_snip_by_era(oa_big4)
non_snip, non_era1, non_era2, non_era3 = select_snip_by_era(oa_nonbig4)

oasjr, oasjr1, oasjr2, oasjr3 = select_sjr_by_era(oa_big4)
nonsjr, nonsjr1, nonsjr2, nonsjr3 = select_sjr_by_era(oa_nonbig4)

# 2011 median SJR per era.  All figures here are medians, and the first era
# covers start years before 1996, so the labels say so.
median_sjr = pd.DataFrame([oasjr1['2011 SJR2'].median(), oasjr2['2011 SJR2'].median(), oasjr3['2011 SJR2'].median()])
median_sjr_col = median_sjr
tmp = median_sjr_col.rename(columns={0: 'Big4 Median SJR'}).T
median_sjr_big4 = tmp.rename(columns={0:'pre-1996', 1: '1996-2001', 2:'2002-2011'})

OA_sjr_big4 = median_sjr_big4.T
OA_sjr_big4['Non-Big4 median SJR'] = ([nonsjr1['2011 SJR2'].median(), nonsjr2['2011 SJR2'].median(), nonsjr3['2011 SJR2'].median()])
OA_sjr_big4.T

OA_sjr_big4.T.plot(kind='bar', rot=0,title='2011 median SJR of OA journals from Big 4, by start year\n Big 4 = USA, UK, Netherlands & Germany');

# Same layout for 2011 median SNIP.
median_snip = pd.DataFrame([oa_era1['2011 SNIP2'].median(), oa_era2['2011 SNIP2'].median(), oa_era3['2011 SNIP2'].median()])
median_snip_col = median_snip
tmp = median_snip_col.rename(columns={0: 'Big4 median SNIP'}).T
median_snip_big4 = tmp.rename(columns={0:'pre-1996', 1: '1996-2001', 2:'2002-2011'})

OA_big4 = median_snip_big4.T
OA_big4['Non-Big4 median SNIP'] = ([non_era1['2011 SNIP2'].median(), non_era2['2011 SNIP2'].median(), non_era3['2011 SNIP2'].median()])
OA_big4.T

OA_big4.T.plot(kind='bar', rot=0,title='Median SNIP of OA journals from Big 4, by start year\n Big 4 = USA, UK, Netherlands & Germany');

# Closed journals split the same way as the open access ones.
big4_closed = closed[closed.Country.isin(country_list)]
non_closed = closed[closed.Country.isin(nonlst)]

big4_closed_snip = find_snip(big4_closed)
non_closed_snip = find_snip(non_closed)

big4_closed_sjr = find_sjr(big4_closed)
non_closed_sjr = find_sjr(non_closed)

# Yearly median SJR for the four groups (rows = years, columns = groups).
plt.figure()
sjr_combo = pd.DataFrame([oasjr.median(), nonsjr.median(), big4_closed_sjr.median(), non_closed_sjr.median()]).T
sjr_combo = sjr_combo.rename(columns={0: 'Big4 OA', 1: 'Non-Big4 OA', 2: 'Big4 closed', 3: 'Non-Big4 closed'})
sjr_combo.plot(rot=30);
plt.show()

# Closed-minus-OA gap within each country group.  The columns are named after
# the group they describe (they were previously labelled 'OA diff' and
# 'closed diff', which did not match what is computed).
sjr_combo['Big4 diff'] = sjr_combo['Big4 closed'] - sjr_combo['Big4 OA']
sjr_combo['Non-Big4 diff'] = sjr_combo['Non-Big4 closed'] - sjr_combo['Non-Big4 OA']
sjr_combo[['Big4 diff', 'Non-Big4 diff']].plot(rot=30, title='SJR difference (closed - OA), Big4 and non-Big4');

sjr_combo.describe()

# Same table for yearly median SNIP.  The 'non-Big4 OA' spelling is kept
# because it ends up as a header in the CSV written below.
kind_combo = pd.DataFrame([oa_snip.median(), non_snip.median(), big4_closed_snip.median(), non_closed_snip.median()]).T
kind_combo = kind_combo.rename(columns={0: 'Big4 OA', 1: 'non-Big4 OA', 2: 'Big4 closed', 3: 'Non-Big4 closed'})

kind_combo.describe()

# Written before the difference columns are added, so the file holds only the
# four group medians.
kind_combo.to_csv("snip_origin_year.csv")

kind_combo.plot(rot = 30, title='Median SNIP by year, access status and country of origin');

kind_combo['Big4 diff'] = kind_combo['Big4 closed'] - kind_combo['Big4 OA']
kind_combo['Non-Big4 diff'] = kind_combo['Non-Big4 closed'] - kind_combo['non-Big4 OA']
kind_combo[['Big4 diff', 'Non-Big4 diff']].plot(rot = 30, title='SNIP difference (closed - OA), Big4 and non-Big4');

def country_impact(df):
    """Summarise 2011 impact per country and relate it to journal counts.

    Groups ``df`` by ``'Country'`` and takes the median 2011 SNIP and SJR of
    each group.  The journal count per country always comes from the
    module-level ``impact`` table, not from ``df``.  As side effects, a scatter
    plot of both medians against the log of the journal count is drawn, and
    the correlation coefficient and p-value from a linear regression of each
    median against the raw journal count are printed.

    :param df: DataFrame with ``'Country'``, ``'2011 SNIP2'`` and
        ``'2011 SJR2'`` columns.
    :returns: DataFrame indexed by country with columns ``'Median snip'``,
        ``'Num_journals_total'`` and ``'Median SJR'``.
    """
    impacts = df.groupby(by='Country')
    pop_countries = pd.DataFrame(impact.Country.value_counts(), columns=['Num_journals_total'])
    country_median = pd.DataFrame(impacts['2011 SNIP2'].median(), columns=['Median snip'])
    country_median['Num_journals_total'] = pop_countries.Num_journals_total
    country_median['Median SJR'] = impacts['2011 SJR2'].median()

    # `log` was never imported in this file; use numpy's explicitly.
    log_journals = np.log(country_median['Num_journals_total'])

    plt.figure()
    plt.scatter(x=log_journals, y=country_median['Median snip'])
    plt.scatter(x=log_journals, y=country_median['Median SJR'], c='y', marker='+')
    plt.title('Median impact by log of number of journals published in that country\nBlue is SNIP, yellow is SJR')

    # Regress only on countries that have every value present.
    clean = country_median.dropna(how='any')
    _, _, r_value, p_value, _ = stats.linregress(clean['Median snip'], clean['Num_journals_total'])
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('SNIP. r = {0}, p = {1}'.format(r_value, p_value))

    _, _, r_value1, p_value1, _ = stats.linregress(clean['Median SJR'], clean['Num_journals_total'])
    print('SJR. r = {0}, p = {1}'.format(r_value1, p_value1))

    return country_median

# Per-country summaries for matched open access journals and closed journals.
# NOTE: this rebinds `open_access` from the journal table to the per-country
# summary; membership() reads `open_access.issn` and must not be called after
# this point.
open_access = country_impact(matches)

closed_access = country_impact(closed)

# Join the two summaries on country.  Merging on the index objects produces a
# 'key_0' column, which is then restored as the index.
combo = pd.merge(open_access, closed_access, suffixes=('_open', '_closed'), left_on=open_access.index, right_on=closed_access.index)
combo = combo.set_index('key_0')
combo['snip_diff'] = combo['Median snip_open'] - combo['Median snip_closed']
combo['sjr_diff'] = combo['Median SJR_open'] - combo['Median SJR_closed']
# Only countries with at least 150 journals are charted.
for_plot = combo[combo['Num_journals_total_open'] >= 150]

# Series.plot / Series.hist draw on the current axes, so each chart below gets
# its own figure instead of piling onto the previous one.
plt.figure()
for_plot.snip_diff.plot(kind='bar', title='Median snip open - median snip closed\n(Negative means closed has higher snip)');

plt.figure()
for_plot.sjr_diff.plot(kind='bar', title='SJR');

plt.figure()
combo.snip_diff.hist(bins=30);

# Bare expressions only produce output in an interactive session.
combo.snip_diff.describe()

plt.figure()
combo.sjr_diff.hist(bins=30);

combo.sjr_diff.describe()

combo.head()

# Relationship between the open-minus-closed SNIP gap and journal count, using
# only countries with every value present.
clean = combo.dropna(how='any')
slope1, intercept1, r_value1, p_value1, std_err1 = stats.linregress(clean['snip_diff'], clean['Num_journals_total_open'])
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('{0} {1}'.format(r_value1, p_value1))

plt.figure()
# `log` was never imported in this file; use numpy's explicitly.
plt.scatter(np.log(clean['Num_journals_total_open']), clean['snip_diff']);

# Without a final show() the figures created after the last plt.show() call
# are never displayed when this file is run as a script.
plt.show()