import pandas as pd from scipy import stats pd.set_option('display.mpl_style', 'default') # nicer plots than the default matplotlib style fileurl = 'https://dl.dropboxusercontent.com/u/25694950/channel_sinuosities.csv' data = pd.read_csv(fileurl) data[:20] figure(figsize=(10,8)) data['Name'].value_counts().plot(kind='bar'); figure(figsize=(10,8)) groups = data.groupby('Large_river') labels = ('no large river','large river') colors = (plt.rcParams['axes.color_cycle'][4],plt.rcParams['axes.color_cycle'][0]) for name, group in groups: plot(group.Slope, group.Sinuosity, marker='o', linestyle='', ms=8, label=labels[name], color=colors[name]) legend(loc='best') xlabel('valley slope', fontsize=18) ylabel('sinuosity', fontsize=18) axis([0.0,0.03,1.0,3.0]); figure(figsize=(10,8)) groups = data.groupby('Large_river') labels = ('no large river','large river') colors = (plt.rcParams['axes.color_cycle'][4],plt.rcParams['axes.color_cycle'][0]) for name, group in groups: plot(group.Latitude, group.Sinuosity, marker='o', linestyle='', ms=8, label=labels[name], color=colors[name]) legend(loc='best') xlabel('latitude', fontsize=18) ylabel('sinuosity', fontsize=18) axis([0.0,70,1.0,3.0]); def bootstrap_regression(x,y,nit): slope, intercept, r_value, p_value, std_err = stats.linregress(x, y) r = np.zeros(nit) n = len(x) for i in range(1,nit): x1 = x[np.random.randint(n, size=n)] y1 = y[np.random.randint(n, size=n)] slope, intercept, r_value2, p_value2, std_err = stats.linregress(x1,y1) r[i] = r_value2 if r_value>0: p_value_boot = len(r[r >= r_value])/float(len(r)) else: p_value_boot = len(r[r <= r_value])/float(len(r)) return r_value, p_value, p_value_boot r_values = np.zeros([3,1]) p_values = np.zeros([3,1]) p_values_boot = np.zeros([3,1]) r_values[0], p_values[0], p_values_boot[0] = bootstrap_regression(data.Latitude,data.Sinuosity,1000) # for variable pairs that involve slope, we can only use those data points that have slope measurements: r_values[1], p_values[1], p_values_boot[1] = bootstrap_regression(data.Slope[isnan(data.Slope)==0],data.Sinuosity[isnan(data.Slope)==0],1000) r_values[2], p_values[2], p_values_boot[2] = bootstrap_regression(data.Latitude[isnan(data.Slope)==0],data.Slope[isnan(data.Slope)==0],1000) df = pd.DataFrame(np.hstack(([[292],[268],[268]],r_values,r_values**2, p_values,p_values_boot)),columns=['n','Pearson R','R squared','p-value', 'p-value bootstrap'],index=('latitude and sinuosity','slope and sinuosity','latitude and slope')) df