import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import d3py

# Load the journal metrics table. Later code reads its 'Source Title',
# 'Country', '2011 SNIP2' and '2011 SJR2' columns.
impact = pd.read_csv(
    'SNIP_SJR_complete_1999_2011new_SNIP_and_SJR_v1_Oct_2012.csv'
)

def count_words(title):
    """Return the number of whitespace-separated words in ``title``.

    Runs of whitespace and leading/trailing whitespace do not produce
    phantom empty "words": ``'Annual  Review '`` counts as 2 and ``''``
    counts as 0.

    Args:
        title: The title text as a string.

    Returns:
        The word count as an int.
    """
    # split() with no separator collapses consecutive whitespace and drops
    # empty strings, whereas split(' ') yields '' for every extra space.
    return len(title.split())

def impact_regression(df):
    """Regress 2011 SNIP on title word count, print the fit, and plot the data.

    Rows with a missing value in 'Source Title', '2011 SNIP2' or
    'title_wordcnt' are dropped before fitting. The correlation coefficient
    and p-value from ``scipy.stats.linregress`` are printed, and a scatter
    plot of word count (x) against SNIP (y) is drawn via ``plt.scatter``.

    Args:
        df: DataFrame containing the 'Source Title', '2011 SNIP2' and
            'title_wordcnt' columns.

    Returns:
        None.
    """
    clean = df[['Source Title', '2011 SNIP2', 'title_wordcnt']].dropna(how='any')
    word_counts = clean['title_wordcnt']
    snip = clean['2011 SNIP2']

    # Only r and p are reported; slope, intercept and std_err are discarded.
    _, _, r_value, p_value, _ = stats.linregress(word_counts, snip)

    # A single parenthesised argument prints identically under Python 2's
    # print statement and Python 3's print function.
    print('Rvalue: {0}\nPvalue: {1}\n\n'.format(r_value, p_value))

    plt.scatter(word_counts, snip)

# Add a per-row word count of the title, then save the augmented table.
impact['title_wordcnt'] = impact['Source Title'].apply(count_words)

impact.to_csv('impact_titlelength.csv')

# Histogram of 2011 SJR scores. Values are clamped to [0, 5] first,
# presumably so outliers land in the edge bins instead of stretching the axis.
impact['2011 SJR2'].clip(0, 5).hist(bins=50, color='MediumAquamarine');
plt.title('Histogram of 2011 SJR scores');
plt.xlabel('SJR score');
plt.ylabel('Counts');

# Histogram of title lengths, clamped to the range 1-10 words.
impact.title_wordcnt.clip(1, 10).hist(bins=10, color='m', alpha=.4);
plt.title('Title length')
plt.xlabel('Number of words in title')
plt.ylabel('Count');

# Regression over every row in the table.
impact_regression(impact)

# Regression restricted to rows whose Country is the United States.
usa = impact[impact.Country == 'United States']

impact_regression(usa)

# First ten entries of the per-country row counts.
impact.Country.value_counts()[:10]

# Rows from four selected countries. `big4` must be defined before the
# summaries below, which read it.
country_list = ['United States', 'United Kingdom', 'Netherlands', 'Germany']
big4 = impact[impact.Country.isin(country_list)]

# Summary statistics for title length and SNIP within the four countries.
big4.title_wordcnt.describe()

big4['2011 SNIP2'].describe()

# Rows with a 2011 SNIP of 30 or more.
big4[big4['2011 SNIP2'] >= 30][['2011 SNIP2', 'Source Title']]

impact_regression(big4)

# Per-country subsets for the United Kingdom, the Netherlands and Germany,
# each run through the same regression.
uk = impact[impact['Country'] == 'United Kingdom']
neth = impact[impact['Country'] == 'Netherlands']
germ = impact[impact['Country'] == 'Germany']

impact_regression(uk)
impact_regression(neth)
impact_regression(germ)

# Keep only the columns used for the per-country comparison.
simplified = big4[['Country', 'title_wordcnt', '2011 SNIP2']]
# Group on the frame's own 'Country' column.
simpl = simplified.groupby('Country')
simplified.to_csv('big4_titlelength.csv')


big4

# Per-country summary statistics for the grouped columns.
simpl.describe()