#url='http://weather-warehouse.com/WeatherHistory/PastWeatherData_IthacaCornellUniv_Ithaca_NY_'+m+'.html'#get data files and store them
#from urllib2 import urlopen
months =( 'January','February','March','April','May','June',
'July','August','September','October','November','December')
#rawpage={}
#for m in months:
#rawpage[m]=urlopen(url).read()
#with open('wdata/'+m+'.html','w') as f: f.write(rawpage[m])
#<table class='stripeMe' id='myTable'><thead><tr><th><span id='year' title='<br>Not all weather stations...
#<tr><td>2011</td><td>3</td><td>60</td><td>37</td><td>26</td><td>22.4</td><td>39.8</td><td>31.1</td><td>3.63</td><td>19.30</td><td>1.29</td><td>15.00</td></tr>
#0:year, 1:lotemp, 2:hitemp, 3:himin, 4:lomax, 5:avgmin, 6:avgmax, 7:mean, 8:precip, 9:snow, 10:maxprecip, 11:maxsnow
import re
def getdata(file):
    """Parse one weather-warehouse monthly history HTML file.

    Returns a list of rows, one per year that has complete data:
    [year(int), lotemp, hitemp, himin, lomax, avgmin, avgmax, mean,
     precip, snow, maxprecip, maxsnow] -- floats after the year.
    Years with any empty table cell are skipped and reported on stdout.
    """
    data = []
    skipped = []
    # 'with' closes the handle deterministically (the old bare open() leaked it)
    with open(file) as f:
        for l in f:
            s = l.rstrip().split('</td><td>')  # split the table entries
            if len(s) == 1: continue           # not a data row at all
            if s[0].find('The official station') > -1: continue  # footer note row
            if s[0].startswith('<table'):
                # the table header and the first data row share one line; split them
                header, s[0] = s[0].split('<tr><td>')
                ids = re.findall(r"id='(.*?)'", header)[1:]  # column ids; [1:] drops the table's own id
                titles = re.findall(r"title='<br>(.*?)\.?'", header)
            if s[0].startswith('<tr><td>'): s[0] = s[0][8:]
            if s[-1].endswith('</td></tr>'): s[-1] = s[-1][:-10]
            # skip entire year if missing data
            if '' in s:
                skipped.append(s[0])
                continue
            # explicit list comprehension (not bare map) so this also works on Python 3
            data.append([int(s[0])] + [float(v) for v in s[1:]])
    # single-argument print behaves identically on Python 2 and Python 3
    print('%s skipping: %s' % (file, ' '.join(skipped)))
    # print(', '.join([str(i)+':'+ids[i] for i in range(len(ids))]))
    return data
# getdata returns a list of rows, one per year: [[1900, 11.0, ...], [1901, ...], ...];
# these are transposed below into per-column lists [[1900 .. 2012],[11.0,6.0, ...], ... ]
##http://weather-warehouse.com/WeatherHistory/PastWeatherData_IthacaCornellUniv_Ithaca_NY_March.html
# sample run on a single month; the printed skip list is shown (commented) below
data=getdata('wdata/March.html')
# output: skipping: 1953 1928 1925 1924 1923 1922 1921 1920 1919 0:year, 1:lotemp, 2:hitemp, 3:himin, 4:lomax, 5:avgmin, 6:avgmax, 7:mean, 8:precip, 9:snow, 10:maxprecip, 11:maxsnow
from scipy import stats
#r,p=stats.pearsonr(xdata,ydata)
#slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)
#files in subdirectory, e.g., wdata/March.html
# yd[month] holds the 12 data columns for that month, each a per-year list
yd = {}
for month in months:
    data = getdata('wdata/' + month + '.html')
    # transpose the per-year rows into per-column lists
    yd[month] = [[row[col] for row in data] for col in range(12)]
# output: wdata/January.html skipping: 1953 1945 1928 1926 1925 1924 1923 1922 1921 1920 1919 1918 wdata/February.html skipping: 1953 1945 1925 1924 1923 1922 1921 1920 1919 wdata/March.html skipping: 1953 1928 1925 1924 1923 1922 1921 1920 1919 wdata/April.html skipping: 1953 1925 1924 1923 1922 1921 1920 1919 wdata/May.html skipping: 1980 1953 1925 1924 1923 1922 1921 1920 1919 1918 1916 1915 1913 1904 1901 wdata/June.html skipping: 2002 1995 1994 1925 1924 1923 1922 1921 1920 1919 1918 1917 1916 1915 1914 1913 1912 1911 1910 1909 1908 1907 1906 1904 1903 1901 1900 wdata/July.html skipping: 1995 1994 1953 1925 1924 1923 1922 1921 1920 1919 1918 1917 1916 1915 1914 1913 1912 1911 1910 1909 1908 1907 1906 1905 1904 1903 1902 1901 wdata/August.html skipping: 1995 1925 1924 1923 1922 1921 1920 1919 1918 1917 1916 1915 1914 1913 1912 1911 1910 1909 1908 1907 1906 1905 1904 1903 1902 1901 wdata/September.html skipping: 1995 1925 1924 1923 1922 1921 1920 1919 1918 1917 1916 1915 1914 1913 1912 1911 1910 1909 1908 1907 1906 1905 1904 1903 1902 1901 wdata/October.html skipping: 1997 1953 1952 1925 1924 1923 1922 1921 1920 1919 1918 1916 1915 1912 wdata/November.html skipping: 1992 1925 1924 1923 1922 1921 1920 1919 1918 1917 wdata/December.html skipping: 2012 1997 1977 1955 1952 1944 1925 1924 1923 1922 1921 1920 1919 1918 1906
#find all the pearson correlations and slopes for temps
# r-value of (year -> temperature column) regression for each month
cor = {}
for m in months:
    cor[m] = []
    # columns 1..7 are the temperature statistics
    for col in range(1, 8):
        slope, intercept, r_value, p_value, std_err = stats.linregress(yd[m][0], yd[m][col])
        cor[m].append(r_value)
#find the biggest correlations
[(m, i + 1, r) for m in cor for i, r in enumerate(cor[m]) if r > .15]
#0:year, 1:lotemp, 2:hitemp, 3:himin, 4:lomax, 5:avgmin, 6:avgmax, 7:mean
# output: [('February', 4, 0.23426362290672637), ('May', 4, 0.20071044420385162), ('November', 4, 0.18258718937579549)]
# February low-max (column 4) against year, with fitted trend line
feb = yd['February']
xdata, ydata = feb[0], feb[4]
slope, intercept, r_value, p_value, std_err = stats.linregress(xdata, ydata)
print('%s %s %s' % (r_value, p_value, std_err))
line = slope * array(xdata) + intercept
plot(xdata, ydata, 'go', xdata, line, 'r-')
xlim(1900, 2013)
grid('on')
# output: 0.234263622907 0.0166856021944 0.0194241136681
figure(figsize=(6, 9))
#pick Aug as half year away, to compare
# same low-max regression/plot for both months; February drawn second in red
for month, fit_style in (('August', 'm-'), ('February', 'r-')):
    xdata = yd[month][0]
    ydata = yd[month][4]
    slope, intercept, r_value, p_value, std_err = stats.linregress(xdata, ydata)
    print('%s %s %s' % (r_value, p_value, std_err))
    line = slope * array(xdata) + intercept
    plot(xdata, ydata, 'o', xdata, line, fit_style)
yticks(linspace(-10,80,19,endpoint=True))
xlim(1900,2013)
legend(['Aug low max','r=.04, p=.74',
        'Feb low max','r=.23, p=.017'],
       'center left')
grid('on')
# output: 0.0362270811744 0.739039358989 0.0167092234008 0.234263622907 0.0166856021944 0.0194241136681
#find all the pearson correlations and slopes for rain, snow
# r-value of (year -> precipitation/snow column) regression for each month
cor = {}
for m in months:
    cor[m] = []
    # columns 8..11 are precip, snow, maxprecip, maxsnow
    for col in range(8, 12):
        slope, intercept, r_value, p_value, std_err = stats.linregress(yd[m][0], yd[m][col])
        cor[m].append(r_value)
#find large correl
[(m, i + 8, r) for m in cor for i, r in enumerate(cor[m]) if r > .2]
#8:precip, 9:snow, 10:maxprecip, 11:maxsnow,
# output: [('January', 9, 0.27200571430585363), ('September', 8, 0.28863588757748476), ('September', 10, 0.30524866894931441), ('April', 10, 0.25402483790029201), ('November', 8, 0.2543321932154195), ('November', 10, 0.25394514976856897)]
# September precip (col 8) and maxprecip (col 10) trends on one axes
xdata = yd['September'][0]
for col, fit_style in ((8, 'm-'), (10, 'r-')):
    ydata = yd['September'][col]
    slope, intercept, r_value, p_value, std_err = stats.linregress(xdata, ydata)
    print('%s %s %s' % (r_value, p_value, std_err))
    line = slope * array(xdata) + intercept
    plot(xdata, ydata, 'o', xdata, line, fit_style)
grid('on')
xlim(1900,2013)
legend(['Sep precip','correl=.29, p=.007','Sep maxprecip','correl=.31, p=.004'],'upper left')
# output: 0.288635887577 0.00670323007409 0.00744376536802 0.305248668949 0.00404100808359 0.00292651925232
# output: <matplotlib.legend.Legend at 0x10a54ff10>
#find large anti-correl
[(m, i + 8, r) for m in cor for i, r in enumerate(cor[m]) if r < -.1]
# output: [('May', 9, -0.11930597118862424), ('May', 11, -0.16533094342645063), ('June', 10, -0.11391434235167339), ('April', 9, -0.12608411789576421), ('April', 11, -0.11136532612725898), ('July', 10, -0.13166601569417957)]
# April snow (col 9) and maxsnow (col 11): weak downward trends
xdata = yd['April'][0]
for col, fit_style in ((9, 'm-'), (11, 'r-')):
    ydata = yd['April'][col]
    slope, intercept, r_value, p_value, std_err = stats.linregress(xdata, ydata)
    print('%s %s %s' % (r_value, p_value, std_err))
    line = slope * array(xdata) + intercept
    plot(xdata, ydata, 'o', xdata, line, fit_style)
grid('on')
xlim(1900,2013)
legend(['April snow','correl=-.13, p=.2','April maxsnow','correl=-.11, p=.26'])
# output: -0.126084117896 0.199971217485 0.0119969352662 -0.111365326127 0.25804788757 0.00681571189758
# output: <matplotlib.legend.Legend at 0x10a134410>
# per-year list of the 12 monthly mean temperatures (column 7);
# 1919-1925 are excluded up front (missing in every month above)
yrdata = {yr: [] for yr in range(1900, 2013) if yr not in range(1919, 1926)}
for data in (yd[m] for m in months):
    for yr, avg in zip(data[0], data[7]):
        yrdata[yr].append(avg)
# regress annual mean on year, using only years with all 12 months present
xdata = [yr for yr in yrdata if len(yrdata[yr]) == 12]
ydata = [mean(yrdata[yr]) for yr in yrdata if len(yrdata[yr]) == 12]
slope, intercept, r_value, p_value, std_err = stats.linregress(xdata, ydata)
print('%s %s %s' % (r_value, p_value, std_err))
line = slope * array(xdata) + intercept
plot(xdata, ydata, 'o', xdata, line, 'r-')
xlim(1926, 2012)
grid('on')
# output: -0.295308262299 0.0117904147382 0.0065951343345
# spot-check: monthly mean temps for 2011, Jan..Dec
yrdata[2011]
# output: [20.2, 22.2, 31.1, 45.8, 58.1, 66.1, 71.9, 67.5, 63.0, 50.2, 44.9, 33.2]
# spot-check: monthly mean temps for 1990, Jan..Dec
yrdata[1990]
# output: [32.4, 29.3, 37.2, 47.1, 53.8, 65.5, 69.0, 67.1, 59.2, 51.8, 41.2, 32.8]