import pandas as pd
from scipy import stats
pd.set_option('display.mpl_style', 'default') # nicer plots than the default matplotlib style

fileurl = 'https://dl.dropboxusercontent.com/u/25694950/channel_sinuosities.csv'
data = pd.read_csv(fileurl)

data[:20]

figure(figsize=(10,8))
data['Name'].value_counts().plot(kind='bar');

figure(figsize=(10,8))
groups = data.groupby('Large_river')
labels = ('no large river','large river')
colors = (plt.rcParams['axes.color_cycle'][4],plt.rcParams['axes.color_cycle'][0])
for name, group in groups:
    plot(group.Slope, group.Sinuosity, marker='o', linestyle='', ms=8, label=labels[name], color=colors[name])
legend(loc='best')
xlabel('valley slope', fontsize=18)
ylabel('sinuosity', fontsize=18)
axis([0.0,0.03,1.0,3.0]);

figure(figsize=(10,8))
groups = data.groupby('Large_river')
labels = ('no large river','large river')
colors = (plt.rcParams['axes.color_cycle'][4],plt.rcParams['axes.color_cycle'][0])
for name, group in groups:
    plot(group.Latitude, group.Sinuosity, marker='o', linestyle='', ms=8, label=labels[name], color=colors[name])
legend(loc='best')
xlabel('latitude', fontsize=18)
ylabel('sinuosity', fontsize=18)
axis([0.0,70,1.0,3.0]);

def bootstrap_regression(x,y,nit):
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    r = np.zeros(nit)
    n = len(x)
    for i in range(1,nit):
        x1 = x[np.random.randint(n, size=n)]
        y1 = y[np.random.randint(n, size=n)]
        slope, intercept, r_value2, p_value2, std_err = stats.linregress(x1,y1)
        r[i] = r_value2
    if r_value>0:
        p_value_boot = len(r[r >= r_value])/float(len(r))
    else:
        p_value_boot = len(r[r <= r_value])/float(len(r))
    return r_value, p_value, p_value_boot

r_values = np.zeros([3,1])
p_values = np.zeros([3,1])
p_values_boot = np.zeros([3,1])
r_values[0], p_values[0], p_values_boot[0] = bootstrap_regression(data.Latitude,data.Sinuosity,1000)
# for variable pairs that involve slope, we can only use those data points that have slope measurements:
r_values[1], p_values[1], p_values_boot[1] = bootstrap_regression(data.Slope[isnan(data.Slope)==0],data.Sinuosity[isnan(data.Slope)==0],1000)
r_values[2], p_values[2], p_values_boot[2] = bootstrap_regression(data.Latitude[isnan(data.Slope)==0],data.Slope[isnan(data.Slope)==0],1000)

df = pd.DataFrame(np.hstack(([[292],[268],[268]],r_values,r_values**2,
                             p_values,p_values_boot)),columns=['n','Pearson R','R squared','p-value',
                            'p-value bootstrap'],index=('latitude and sinuosity','slope and sinuosity','latitude and slope'))

df