#url='http://weather-warehouse.com/WeatherHistory/PastWeatherData_IthacaCornellUniv_Ithaca_NY_'+m+'.html'#get data files and store them
#from urllib2 import urlopen
months =( 'January','February','March','April','May','June',
'July','August','September','October','November','December')
#rawpage={}
#for m in months:
#rawpage[m]=urlopen(url).read()
#with open('wdata/'+m+'.html','w') as f: f.write(rawpage[m])
#<table class='stripeMe' id='myTable'><thead><tr><th><span id='year' title='<br>Not all weather stations...
#<tr><td>2011</td><td>3</td><td>60</td><td>37</td><td>26</td><td>22.4</td><td>39.8</td><td>31.1</td><td>3.63</td><td>19.30</td><td>1.29</td><td>15.00</td></tr>
#0:year, 1:lotemp, 2:hitemp, 3:himin, 4:lomax, 5:avgmin, 6:avgmax, 7:mean, 8:precip, 9:snow, 10:maxprecip, 11:maxsnow
import re
def getdata(file):
    """Parse one weather-warehouse monthly history HTML file.

    Returns a list of rows, one per year that has complete data:
    [year(int), lotemp, hitemp, himin, lomax, avgmin, avgmax, mean,
     precip, snow, maxprecip, maxsnow] -- floats after the year.
    Years with any empty table cell are skipped and reported on stdout.
    """
    data = []
    skipped = []
    # 'with' closes the handle deterministically (the old bare open() leaked it)
    with open(file) as f:
        for l in f:
            s = l.rstrip().split('</td><td>')  # split the table entries
            if len(s) == 1: continue           # not a data row at all
            if s[0].find('The official station') > -1: continue  # footer note row
            if s[0].startswith('<table'):
                # the table header and the first data row share one line; split them
                header, s[0] = s[0].split('<tr><td>')
                ids = re.findall(r"id='(.*?)'", header)[1:]  # column ids; [1:] drops the table's own id
                titles = re.findall(r"title='<br>(.*?)\.?'", header)
            if s[0].startswith('<tr><td>'): s[0] = s[0][8:]
            if s[-1].endswith('</td></tr>'): s[-1] = s[-1][:-10]
            # skip entire year if missing data
            if '' in s:
                skipped.append(s[0])
                continue
            # explicit list comprehension (not bare map) so this also works on Python 3
            data.append([int(s[0])] + [float(v) for v in s[1:]])
    # single-argument print behaves identically on Python 2 and Python 3
    print('%s skipping: %s' % (file, ' '.join(skipped)))
    # print(', '.join([str(i)+':'+ids[i] for i in range(len(ids))]))
    return data
# getdata returns a list of rows, one per year: [[1900, 11.0, ...], [1901, ...], ...];
# these are transposed below into per-column lists [[1900 .. 2012],[11.0,6.0, ...], ... ]
##http://weather-warehouse.com/WeatherHistory/PastWeatherData_IthacaCornellUniv_Ithaca_NY_March.html
# sample run on a single month; the printed skip list is shown (commented) below
data=getdata('wdata/March.html')
# output: skipping: 1953 1928 1925 1924 1923 1922 1921 1920 1919 0:year, 1:lotemp, 2:hitemp, 3:himin, 4:lomax, 5:avgmin, 6:avgmax, 7:mean, 8:precip, 9:snow, 10:maxprecip, 11:maxsnow
from scipy import stats
#r,p=stats.pearsonr(xdata,ydata)
#slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)
#files in subdirectory, e.g., wdata/March.html
# yd[month] holds the 12 data columns for that month, each a per-year list
yd = {}
for month in months:
    data = getdata('wdata/' + month + '.html')
    # transpose the per-year rows into per-column lists
    yd[month] = [[row[col] for row in data] for col in range(12)]
# output: wdata/January.html skipping: 1953 1945 1928 1926 1925 1924 1923 1922 1921 1920 1919 1918 wdata/February.html skipping: 1953 1945 1925 1924 1923 1922 1921 1920 1919 wdata/March.html skipping: 1953 1928 1925 1924 1923 1922 1921 1920 1919 wdata/April.html skipping: 1953 1925 1924 1923 1922 1921 1920 1919 wdata/May.html skipping: 1980 1953 1925 1924 1923 1922 1921 1920 1919 1918 1916 1915 1913 1904 1901 wdata/June.html skipping: 2002 1995 1994 1925 1924 1923 1922 1921 1920 1919 1918 1917 1916 1915 1914 1913 1912 1911 1910 1909 1908 1907 1906 1904 1903 1901 1900 wdata/July.html skipping: 1995 1994 1953 1925 1924 1923 1922 1921 1920 1919 1918 1917 1916 1915 1914 1913 1912 1911 1910 1909 1908 1907 1906 1905 1904 1903 1902 1901 wdata/August.html skipping: 1995 1925 1924 1923 1922 1921 1920 1919 1918 1917 1916 1915 1914 1913 1912 1911 1910 1909 1908 1907 1906 1905 1904 1903 1902 1901 wdata/September.html skipping: 1995 1925 1924 1923 1922 1921 1920 1919 1918 1917 1916 1915 1914 1913 1912 1911 1910 1909 1908 1907 1906 1905 1904 1903 1902 1901 wdata/October.html skipping: 1997 1953 1952 1925 1924 1923 1922 1921 1920 1919 1918 1916 1915 1912 wdata/November.html skipping: 1992 1925 1924 1923 1922 1921 1920 1919 1918 1917 wdata/December.html skipping: 2012 1997 1977 1955 1952 1944 1925 1924 1923 1922 1921 1920 1919 1918 1906
#find all the pearson correlations and slopes for temps
# r-value of (year -> temperature column) regression for each month
cor = {}
for m in months:
    cor[m] = []
    # columns 1..7 are the temperature statistics
    for col in range(1, 8):
        slope, intercept, r_value, p_value, std_err = stats.linregress(yd[m][0], yd[m][col])
        cor[m].append(r_value)
#find the biggest correlations
[(m, i + 1, r) for m in cor for i, r in enumerate(cor[m]) if r > .15]
#0:year, 1:lotemp, 2:hitemp, 3:himin, 4:lomax, 5:avgmin, 6:avgmax, 7:mean
# output: [('February', 4, 0.23426362290672637), ('May', 4, 0.20071044420385162), ('November', 4, 0.18258718937579549)]
# February low-max (column 4) against year, with fitted trend line
feb = yd['February']
xdata, ydata = feb[0], feb[4]
slope, intercept, r_value, p_value, std_err = stats.linregress(xdata, ydata)
print('%s %s %s' % (r_value, p_value, std_err))
line = slope * array(xdata) + intercept
plot(xdata, ydata, 'go', xdata, line, 'r-')
xlim(1900, 2013)
grid('on')
# output: 0.234263622907 0.0166856021944 0.0194241136681
figure(figsize=(6, 9))
#pick Aug as half year away, to compare
# same low-max regression/plot for both months; February drawn second in red
for month, fit_style in (('August', 'm-'), ('February', 'r-')):
    xdata = yd[month][0]
    ydata = yd[month][4]
    slope, intercept, r_value, p_value, std_err = stats.linregress(xdata, ydata)
    print('%s %s %s' % (r_value, p_value, std_err))
    line = slope * array(xdata) + intercept
    plot(xdata, ydata, 'o', xdata, line, fit_style)
yticks(linspace(-10,80,19,endpoint=True))
xlim(1900,2013)
legend(['Aug low max','r=.04, p=.74',
        'Feb low max','r=.23, p=.017'],
       'center left')
grid('on')
# output: 0.0362270811744 0.739039358989 0.0167092234008 0.234263622907 0.0166856021944 0.0194241136681
#find all the pearson correlations and slopes for rain, snow
# r-value of (year -> precipitation/snow column) regression for each month
cor = {}
for m in months:
    cor[m] = []
    # columns 8..11 are precip, snow, maxprecip, maxsnow
    for col in range(8, 12):
        slope, intercept, r_value, p_value, std_err = stats.linregress(yd[m][0], yd[m][col])
        cor[m].append(r_value)
#find large correl
[(m, i + 8, r) for m in cor for i, r in enumerate(cor[m]) if r > .2]
#8:precip, 9:snow, 10:maxprecip, 11:maxsnow,
# output: [('January', 9, 0.27200571430585363), ('September', 8, 0.28863588757748476), ('September', 10, 0.30524866894931441), ('April', 10, 0.25402483790029201), ('November', 8, 0.2543321932154195), ('November', 10, 0.25394514976856897)]
# September precip (col 8) and maxprecip (col 10) trends on one axes
xdata = yd['September'][0]
for col, fit_style in ((8, 'm-'), (10, 'r-')):
    ydata = yd['September'][col]
    slope, intercept, r_value, p_value, std_err = stats.linregress(xdata, ydata)
    print('%s %s %s' % (r_value, p_value, std_err))
    line = slope * array(xdata) + intercept
    plot(xdata, ydata, 'o', xdata, line, fit_style)
grid('on')
xlim(1900,2013)
legend(['Sep precip','correl=.29, p=.007','Sep maxprecip','correl=.31, p=.004'],'upper left')
# output: 0.288635887577 0.00670323007409 0.00744376536802 0.305248668949 0.00404100808359 0.00292651925232
# output: <matplotlib.legend.Legend at 0x10a54ff10>
#find large anti-correl
[(m, i + 8, r) for m in cor for i, r in enumerate(cor[m]) if r < -.1]
# output: [('May', 9, -0.11930597118862424), ('May', 11, -0.16533094342645063), ('June', 10, -0.11391434235167339), ('April', 9, -0.12608411789576421), ('April', 11, -0.11136532612725898), ('July', 10, -0.13166601569417957)]
# April snow (col 9) and maxsnow (col 11): weak downward trends
xdata = yd['April'][0]
for col, fit_style in ((9, 'm-'), (11, 'r-')):
    ydata = yd['April'][col]
    slope, intercept, r_value, p_value, std_err = stats.linregress(xdata, ydata)
    print('%s %s %s' % (r_value, p_value, std_err))
    line = slope * array(xdata) + intercept
    plot(xdata, ydata, 'o', xdata, line, fit_style)
grid('on')
xlim(1900,2013)
legend(['April snow','correl=-.13, p=.2','April maxsnow','correl=-.11, p=.26'])
# output: -0.126084117896 0.199971217485 0.0119969352662 -0.111365326127 0.25804788757 0.00681571189758
# output: <matplotlib.legend.Legend at 0x10a134410>
# per-year list of the 12 monthly mean temperatures (column 7);
# 1919-1925 are excluded up front (missing in every month above)
yrdata = {yr: [] for yr in range(1900, 2013) if yr not in range(1919, 1926)}
for data in (yd[m] for m in months):
    for yr, avg in zip(data[0], data[7]):
        yrdata[yr].append(avg)
# regress annual mean on year, using only years with all 12 months present
xdata = [yr for yr in yrdata if len(yrdata[yr]) == 12]
ydata = [mean(yrdata[yr]) for yr in yrdata if len(yrdata[yr]) == 12]
slope, intercept, r_value, p_value, std_err = stats.linregress(xdata, ydata)
print('%s %s %s' % (r_value, p_value, std_err))
line = slope * array(xdata) + intercept
plot(xdata, ydata, 'o', xdata, line, 'r-')
xlim(1926, 2012)
grid('on')
# output: -0.295308262299 0.0117904147382 0.0065951343345
# spot-check: monthly mean temps for 2011, Jan..Dec
yrdata[2011]
# output: [20.2, 22.2, 31.1, 45.8, 58.1, 66.1, 71.9, 67.5, 63.0, 50.2, 44.9, 33.2]
# spot-check: monthly mean temps for 1990, Jan..Dec
yrdata[1990]
# output: [32.4, 29.3, 37.2, 47.1, 53.8, 65.5, 69.0, 67.1, 59.2, 51.8, 41.2, 32.8]