"""Merge JCR impact-factor tables (2000-2013) and plot per-journal IF trends.

Notebook-style script.  It reads the yearly ``data/SCIE_JCR<year>.csv`` tables
and ``data/2013_IF.csv``, joins them with the 2012 subject-category table,
writes the merged table to ``JCRMerged.csv`` and saves a trend figure as a PDF
under ``DROPBOXHOME``.
"""
import os
from functools import reduce

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# DROPBOXHOME is needed for the output path at the end of the script but was
# never defined in this file -- presumably it came from the author's
# interactive session (TODO confirm).  Take it from the environment and fall
# back to ~/Dropbox.
DROPBOXHOME = os.environ.get('DROPBOXHOME', os.path.expanduser('~/Dropbox'))


def load_jcr_old(year):
    """Load the JCR table of one pre-2013 year, indexed by ISSN.

    Reads ``data/SCIE_JCR<year>.csv`` (skipping its first two rows), keeps the
    'ISSN', 'Impact Factor' and '{<year>} Articles' columns and renames the
    latter two to ``IF<year>`` and ``A<year>``.

    Args:
        year: Year of the table to load; used in the file name and in the
            resulting column names.

    Returns:
        A DataFrame indexed by ISSN with columns ``IF<year>`` and ``A<year>``.
    """
    columns_to_load = ['ISSN', 'Impact Factor', '{' + str(year) + '} Articles']
    tbl = pd.read_csv('data/SCIE_JCR{}.csv'.format(year), skiprows=2)[columns_to_load]
    tbl.columns = ['ISSN', 'IF' + str(year), 'A' + str(year)]
    return tbl.set_index('ISSN')


# NOTE: bare expressions such as ``x.head()`` below are notebook display
# leftovers; they have no visible effect when this file runs as a script.
tbl2000 = load_jcr_old(2000)
tbl2000.head()

# Was the IPython shell shortcut ``ls data``, which is not valid Python.
os.listdir('data')

# ---------------------------------------------------------------------------
# 2000-2012 tables: outer-join on ISSN, keep journals that have a 2012 entry.
# ---------------------------------------------------------------------------
oldtables = [load_jcr_old(y) for y in range(2000, 2013)]
alloldtables = reduce(
    lambda x, y: pd.merge(x, y, how='outer', left_index=True, right_index=True),
    oldtables,
).dropna(subset=['A2012'])
alloldtables.tail()

# ---------------------------------------------------------------------------
# 2013 table: different layout, keyed by journal title instead of ISSN.
# ---------------------------------------------------------------------------
tbl = (
    pd.read_csv('data/2013_IF.csv', skiprows=1, na_values=('Not Available',),
                thousands=',')
    .drop_duplicates()
    [['Full Journal Title', 'Journal Impact Factor', 'Total Cites']]
    .copy()  # own the data: columns are added below
)
# Upper-case the titles so they can be matched against 'Journal Title (Full)';
# non-string titles (e.g. NaN) become ''.
tbl['Full Journal Title'] = tbl['Full Journal Title'].apply(
    lambda x: x.upper() if isinstance(x, str) else '')
tbl.head()

# A zero impact factor makes the ratio infinite, which cannot be cast to int;
# treat it like a missing value (0).
tbl['A2013'] = (
    (tbl['Total Cites'] / tbl['Journal Impact Factor'] / 2)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
    .astype(int)
)
tbl['IF2013'] = tbl['Journal Impact Factor']
tbl.head()
jcr2013 = tbl[['A2013', 'IF2013', 'Full Journal Title']]

# ---------------------------------------------------------------------------
# Subject categories (2012) and the final merge.
# ---------------------------------------------------------------------------
category2012 = pd.read_csv('data/SCIE_JCR2012-category.csv', skiprows=2)[[
    'ISSN', 'Subject Category', 'Journal Title (Full)',
    'Abbreviated Journal Title']]
category2012.head()

selected_categories = {
    'BIOCHEMICAL RESEARCH METHODS',
    'BIOCHEMISTRY & MOLECULAR BIOLOGY',
    'BIOLOGY',
    'BIOTECHNOLOGY & APPLIED MICROBIOLOGY',
    'CELL BIOLOGY',
    'BIOPHYSICS',
    'CRYSTALLOGRAPHY',
    'DEVELOPMENTAL BIOLOGY',
    'GENETICS & HEREDITY',
    'IMMUNOLOGY',
    'MATHEMATICAL & COMPUTATIONAL BIOLOGY',
    'MICROBIOLOGY',
    'MYCOLOGY',
    'MULTIDISCIPLINARY SCIENCES',
    'NEUROSCIENCES',
    'PLANT SCIENCES',
    'VIROLOGY',
}
selectedcategory2012 = category2012[
    category2012['Subject Category'].isin(selected_categories)]
len(category2012), len(selectedcategory2012)

merge1 = pd.merge(selectedcategory2012, alloldtables, how='inner',
                  left_on='ISSN', right_index=True)
# DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
merge2 = pd.merge(merge1, jcr2013, how='inner',
                  left_on='Journal Title (Full)',
                  right_on='Full Journal Title'
                  ).sort_values('IF2013', ascending=False)
merge2.head()
merge2.to_csv('JCRMerged.csv')

# ---------------------------------------------------------------------------
# Trend: slope of a straight-line fit to log2(IF) over the trend years.
# ---------------------------------------------------------------------------
gtbl = merge2
trend_first_year = 2011
trend_last_year = 2013
trendfit_years = range(trend_first_year, trend_last_year + 1)
trendfit_if_labels = ['IF{}'.format(y) for y in trendfit_years]
for y in trendfit_years:
    gtbl['LOGIF{}'.format(y)] = np.log2(gtbl['IF{}'.format(y)])
logif_labels = ['LOGIF{}'.format(y) for y in trendfit_years]

_trendfit_x = np.array(trendfit_years, dtype=float)


def _trend_slope(row):
    """Return the slope of a degree-1 fit of the row's log2(IF) values.

    Rows of ``gtbl`` are object-dtype (the table also holds text columns), so
    the values are converted to float first.  NaN is returned when any value
    is missing or infinite, since a least-squares fit is undefined then.
    """
    logifs = np.asarray(row[logif_labels], dtype=float)
    if not np.isfinite(logifs).all():
        return np.nan
    return np.polyfit(_trendfit_x, logifs, 1)[0]


gtbl['trend_slope'] = gtbl.apply(_trend_slope, axis=1)
gtbl.iloc[:, -6:].head()  # .ix no longer exists in pandas
gtbl['trend_slope'].hist(range=(-2, 2), bins=50)

# Geometric mean of the impact factors over the trend years.
gtbl['recent_if'] = (
    gtbl[trendfit_if_labels].product(axis=1) ** (1.0 / len(trendfit_years)))

# ---------------------------------------------------------------------------
# Figure: one small IF-history curve per journal, placed at
# (recent IF, trend slope).
# ---------------------------------------------------------------------------
full_if_years = np.arange(2003, 2014)
full_if_labels = ['IF{}'.format(y) for y in full_if_years]
years_zero_centered = full_if_years - full_if_years.mean()

xshrink = 0.1        # horizontal size of a curve, per year
yshrink = 0.1        # vertical size of a curve, per unit of IF / recent_if
xremapscale = 0.75   # x positions are recent_if ** xremapscale
xlabelcutoff = 15 ** xremapscale

fig = plt.figure(figsize=(8.5, 7))
plottbl = gtbl[gtbl['recent_if'] > 4]

for rowi, row in plottbl.iterrows():
    # alignment positions of this subplot
    xcenter = row['recent_if'] ** xremapscale
    ytop = row['trend_slope']

    ifs = np.asarray(row[full_if_labels], dtype=float)
    # The first `nmissing` points are dropped below, i.e. missing years are
    # assumed to be the earliest ones -- TODO confirm against the data.
    nmissing = int(np.isnan(ifs).sum())

    # Scale the IF history relative to the recent IF and centre it
    # vertically; NaN-aware min/max so missing years do not poison the range.
    yoffsets = ifs / row['recent_if'] * yshrink
    ylow = np.nanmin(yoffsets)
    yheight = np.nanmax(yoffsets) - ylow
    yoffsets = yoffsets - ylow - yheight / 2

    ypoints = (yoffsets + ytop)[nmissing:]
    xpoints = (years_zero_centered * xshrink + xcenter)[nmissing:]

    plt.plot(xpoints, ypoints, c='black', alpha=0.7)
    # Label only journals far to the right or with a pronounced trend.
    if xcenter > xlabelcutoff or abs(ytop) > 0.2:
        plt.annotate(row['Abbreviated Journal Title'].title(),
                     (xpoints[-1], ypoints[-1]))

plt.axhline(0, c='black', lw=1)
plt.ylim(-0.8, 0.8)
# Ticks are labelled with the original IF values at their remapped positions.
xtickpositions = np.array([5, 10, 15, 20, 25, 30, 35, 40, 45])
plt.xticks(xtickpositions ** xremapscale, list(map(str, xtickpositions)))
plt.grid(True, ls='-', alpha=0.3)
plt.xlabel('Mean Impact Factor 2011-2013')
plt.ylabel('IF Change Trend 2011-2013')
plt.title('All Journals Related to Molecular & Cell Biology')
plt.savefig(DROPBOXHOME + '/Data/2014/jcr-trend-all.pdf')