Load monthly, historical precipitation data from the California Department of Water Resources. The tables are linked from http://cdec.water.ca.gov/cgi-progs/prevprecip/PRECIPOUT, e.g. http://cdec.water.ca.gov/cgi-progs/reports/PRECIPOUT.2011.
import lxml.html
import numpy as np
import pandas as pd
# constants
REGION = 'region'
SUBREGION = 'subregion'
DATAROW = 'datarow'
Get some example rows for testing functions later.
h = lxml.html.parse('http://cdec.water.ca.gov/cgi-progs/reports/PRECIPOUT.2011')
table = h.xpath('//*[@id="main_content"]/div/div[1]/table')[0]
rows = table.getchildren()
region_row = rows[0]
subregion_row = rows[2]
station_row = rows[3]
def get_table(url):
"""Return HTML table from URL."""
xpath = '//*[@id="main_content"]/div/div[1]/table'
return lxml.html.parse(url).xpath(xpath)[0]
def row_type(row):
"""Categorize a row."""
cells = row.iterchildren()
if next(cells).tag == 'th':
# first cell is a <th> element
return REGION
elif next(cells).text_content().strip() == ' ':
# second cell is empty
return SUBREGION
else:
return DATAROW
assert row_type(region_row) == REGION, row_type(region_row)
assert row_type(subregion_row) == SUBREGION, row_type(subregion_row)
assert row_type(station_row) == DATAROW, row_type(station_row)
def get_region(row):
"""Get region name from a REGION row."""
return row.text_content().strip()
assert get_region(region_row) == 'NORTH COAST'
def get_subregion(row):
"""Parse subregion and month names from SUBREGION row."""
cell_text = [cell.text_content().strip() for cell in row]
return cell_text[0], cell_text[3:]
assert get_subregion(subregion_row)[0] == 'SMITH RIVER'
assert get_subregion(subregion_row)[1] == \
'Oct Nov Dec Jan Feb Mar Apr May Jun Jul Aug Sep Oct-Sep Wat-Yr'.split()
def get_elev_abbrev(cell):
"""Parse elevation and station abbreviation."""
tc = cell.text_content().strip()
return int(tc[:-6]), tc[-4:-1]
assert get_elev_abbrev(station_row.getchildren()[1]) == (384, 'GAS')
def get_precip_data(cell):
"""Parse a precip data cell."""
tc = [s.strip() for s in cell.itertext()]
for i, t in enumerate(tc):
if t in {'---', ''}:
tc[i] = float('nan')
elif t.endswith('%'):
tc[i] = float(t[:-1])
else:
tc[i] = float(t)
return tc
assert get_precip_data(station_row.getchildren()[3]) == [8.4, 7.53, 112]
def iter_months(months, cells):
"""Iterate over months and month data cells."""
for m, c in zip(months, cells):
yield m, get_precip_data(c)
def table_to_frame(table, year):
"""Turn HTML table of precip data into a DataFrame."""
datarows = []
rows = table.iterchildren()
while True:
try:
r = next(rows)
except StopIteration:
break
rt = row_type(r)
if rt == REGION:
region = get_region(r)
next(rows) # burn the "Station Elev." rows
elif rt == SUBREGION:
subregion, months = get_subregion(r)
elif rt == DATAROW:
cells = r.iterchildren()
station = next(cells).text_content().strip()
elev, abbrev = get_elev_abbrev(next(cells))
next(cells) # burn the Precip Average %-avg cell
for m, values in iter_months(months, cells):
dr = [region, subregion, station, abbrev, elev, year, m]
datarows.append(dr + list(values))
return pd.DataFrame(datarows, columns=[
'region', 'subregion', 'station', 'abbreviation', 'elevation',
'year', 'month', 'precip', 'avg precip', 'pct of avg'])
years = [2011, 2012, 2013, 2014]
url = 'http://cdec.water.ca.gov/cgi-progs/reports/PRECIPOUT.{}'
def all_data(url, years):
frames = []
for y in years:
df = table_to_frame(get_table(url.format(y)), y)
frames.append(df)
return pd.concat(frames, ignore_index=True)
df = all_data(url, years)
df
region | subregion | station | abbreviation | elevation | year | month | precip | avg precip | pct of avg | |
---|---|---|---|---|---|---|---|---|---|---|
0 | NORTH COAST | SMITH RIVER | Gasquet Ranger Station | GAS | 384 | 2011 | Oct | 8.40 | 7.53 | 112 |
1 | NORTH COAST | SMITH RIVER | Gasquet Ranger Station | GAS | 384 | 2011 | Nov | 12.42 | 14.14 | 88 |
2 | NORTH COAST | SMITH RIVER | Gasquet Ranger Station | GAS | 384 | 2011 | Dec | 25.00 | 16.37 | 153 |
3 | NORTH COAST | SMITH RIVER | Gasquet Ranger Station | GAS | 384 | 2011 | Jan | 4.91 | 16.45 | 30 |
4 | NORTH COAST | SMITH RIVER | Gasquet Ranger Station | GAS | 384 | 2011 | Feb | 9.53 | 11.95 | 80 |
5 | NORTH COAST | SMITH RIVER | Gasquet Ranger Station | GAS | 384 | 2011 | Mar | 26.47 | 11.08 | 239 |
6 | NORTH COAST | SMITH RIVER | Gasquet Ranger Station | GAS | 384 | 2011 | Apr | 10.25 | 6.47 | 158 |
7 | NORTH COAST | SMITH RIVER | Gasquet Ranger Station | GAS | 384 | 2011 | May | 4.67 | 4.43 | 105 |
8 | NORTH COAST | SMITH RIVER | Gasquet Ranger Station | GAS | 384 | 2011 | Jun | 1.62 | 0.83 | 195 |
9 | NORTH COAST | SMITH RIVER | Gasquet Ranger Station | GAS | 384 | 2011 | Jul | 0.39 | 0.56 | 70 |
10 | NORTH COAST | SMITH RIVER | Gasquet Ranger Station | GAS | 384 | 2011 | Aug | 0.00 | 0.68 | 0 |
11 | NORTH COAST | SMITH RIVER | Gasquet Ranger Station | GAS | 384 | 2011 | Sep | 0.34 | 1.84 | 18 |
12 | NORTH COAST | SMITH RIVER | Gasquet Ranger Station | GAS | 384 | 2011 | Oct-Sep | 104.00 | 92.33 | 113 |
13 | NORTH COAST | SMITH RIVER | Gasquet Ranger Station | GAS | 384 | 2011 | Wat-Yr | NaN | 92.33 | 113 |
14 | NORTH COAST | KLAMATH RIVER | Callahan | CAL | 3185 | 2011 | Oct | 4.34 | 1.40 | 310 |
15 | NORTH COAST | KLAMATH RIVER | Callahan | CAL | 3185 | 2011 | Nov | 1.66 | 2.75 | 60 |
16 | NORTH COAST | KLAMATH RIVER | Callahan | CAL | 3185 | 2011 | Dec | 5.65 | 3.67 | 154 |
17 | NORTH COAST | KLAMATH RIVER | Callahan | CAL | 3185 | 2011 | Jan | 0.91 | 3.68 | 25 |
18 | NORTH COAST | KLAMATH RIVER | Callahan | CAL | 3185 | 2011 | Feb | 1.06 | 2.58 | 41 |
19 | NORTH COAST | KLAMATH RIVER | Callahan | CAL | 3185 | 2011 | Mar | 5.42 | 2.14 | 253 |
20 | NORTH COAST | KLAMATH RIVER | Callahan | CAL | 3185 | 2011 | Apr | 0.96 | 1.38 | 70 |
21 | NORTH COAST | KLAMATH RIVER | Callahan | CAL | 3185 | 2011 | May | 1.54 | 0.83 | 186 |
22 | NORTH COAST | KLAMATH RIVER | Callahan | CAL | 3185 | 2011 | Jun | 1.42 | 0.67 | 212 |
23 | NORTH COAST | KLAMATH RIVER | Callahan | CAL | 3185 | 2011 | Jul | 0.42 | 0.18 | 233 |
24 | NORTH COAST | KLAMATH RIVER | Callahan | CAL | 3185 | 2011 | Aug | 0.23 | 0.29 | 79 |
25 | NORTH COAST | KLAMATH RIVER | Callahan | CAL | 3185 | 2011 | Sep | 0.09 | 0.48 | 19 |
26 | NORTH COAST | KLAMATH RIVER | Callahan | CAL | 3185 | 2011 | Oct-Sep | 23.70 | 20.05 | 118 |
27 | NORTH COAST | KLAMATH RIVER | Callahan | CAL | 3185 | 2011 | Wat-Yr | NaN | 20.05 | 118 |
28 | NORTH COAST | KLAMATH RIVER | Fort Jones RS | FJN | 2725 | 2011 | Oct | 3.25 | 1.50 | 217 |
29 | NORTH COAST | KLAMATH RIVER | Fort Jones RS | FJN | 2725 | 2011 | Nov | 2.64 | 2.90 | 91 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9650 | COLORADO RIVER | COLORADO DESERT | Blythe | BLY | 390 | 2014 | Jul | 0.00 | 0.19 | 0 |
9651 | COLORADO RIVER | COLORADO DESERT | Blythe | BLY | 390 | 2014 | Aug | 0.49 | 0.68 | 72 |
9652 | COLORADO RIVER | COLORADO DESERT | Blythe | BLY | 390 | 2014 | Oct-Aug | 1.38 | 3.23 | 43 |
9653 | COLORADO RIVER | COLORADO DESERT | Blythe | BLY | 390 | 2014 | Wat-Yr | NaN | 3.52 | 39 |
9654 | COLORADO RIVER | COLORADO DESERT | Niland | NLD | -60 | 2014 | Oct | 0.03 | 0.24 | 12 |
9655 | COLORADO RIVER | COLORADO DESERT | Niland | NLD | -60 | 2014 | Nov | NaN | 0.24 | NaN |
9656 | COLORADO RIVER | COLORADO DESERT | Niland | NLD | -60 | 2014 | Dec | NaN | 0.38 | NaN |
9657 | COLORADO RIVER | COLORADO DESERT | Niland | NLD | -60 | 2014 | Jan | 0.00 | 0.33 | 0 |
9658 | COLORADO RIVER | COLORADO DESERT | Niland | NLD | -60 | 2014 | Feb | 0.00 | 0.27 | 0 |
9659 | COLORADO RIVER | COLORADO DESERT | Niland | NLD | -60 | 2014 | Mar | NaN | 0.27 | NaN |
9660 | COLORADO RIVER | COLORADO DESERT | Niland | NLD | -60 | 2014 | Apr | NaN | 0.10 | NaN |
9661 | COLORADO RIVER | COLORADO DESERT | Niland | NLD | -60 | 2014 | May | NaN | 0.01 | NaN |
9662 | COLORADO RIVER | COLORADO DESERT | Niland | NLD | -60 | 2014 | Jun | 0.00 | 0.01 | 0 |
9663 | COLORADO RIVER | COLORADO DESERT | Niland | NLD | -60 | 2014 | Jul | 0.12 | 0.24 | 50 |
9664 | COLORADO RIVER | COLORADO DESERT | Niland | NLD | -60 | 2014 | Aug | NaN | 0.32 | NaN |
9665 | COLORADO RIVER | COLORADO DESERT | Niland | NLD | -60 | 2014 | Oct-Aug | NaN | 2.41 | NaN |
9666 | COLORADO RIVER | COLORADO DESERT | Niland | NLD | -60 | 2014 | Wat-Yr | NaN | 2.63 | NaN |
9667 | COLORADO RIVER | COLORADO DESERT | Imperial Valley | IMP | -64 | 2014 | Oct | 0.01 | 0.23 | 4 |
9668 | COLORADO RIVER | COLORADO DESERT | Imperial Valley | IMP | -64 | 2014 | Nov | 0.95 | 0.18 | 528 |
9669 | COLORADO RIVER | COLORADO DESERT | Imperial Valley | IMP | -64 | 2014 | Dec | 0.00 | 0.43 | 0 |
9670 | COLORADO RIVER | COLORADO DESERT | Imperial Valley | IMP | -64 | 2014 | Jan | 0.00 | 0.38 | 0 |
9671 | COLORADO RIVER | COLORADO DESERT | Imperial Valley | IMP | -64 | 2014 | Feb | 0.00 | 0.36 | 0 |
9672 | COLORADO RIVER | COLORADO DESERT | Imperial Valley | IMP | -64 | 2014 | Mar | 0.01 | 0.19 | 5 |
9673 | COLORADO RIVER | COLORADO DESERT | Imperial Valley | IMP | -64 | 2014 | Apr | 0.00 | 0.13 | 0 |
9674 | COLORADO RIVER | COLORADO DESERT | Imperial Valley | IMP | -64 | 2014 | May | 0.00 | 0.01 | 0 |
9675 | COLORADO RIVER | COLORADO DESERT | Imperial Valley | IMP | -64 | 2014 | Jun | 0.00 | 0.00 | NaN |
9676 | COLORADO RIVER | COLORADO DESERT | Imperial Valley | IMP | -64 | 2014 | Jul | 0.00 | 0.10 | 0 |
9677 | COLORADO RIVER | COLORADO DESERT | Imperial Valley | IMP | -64 | 2014 | Aug | NaN | 0.36 | NaN |
9678 | COLORADO RIVER | COLORADO DESERT | Imperial Valley | IMP | -64 | 2014 | Oct-Aug | NaN | 2.37 | NaN |
9679 | COLORADO RIVER | COLORADO DESERT | Imperial Valley | IMP | -64 | 2014 | Wat-Yr | NaN | 2.73 | NaN |
9680 rows × 10 columns
df.to_csv('precip_data.csv', index=False)
!head precip_data.csv
region,subregion,station,abbreviation,elevation,year,month,precip,avg precip,pct of avg NORTH COAST,SMITH RIVER,Gasquet Ranger Station,GAS,384,2011,Oct,8.4,7.53,112.0 NORTH COAST,SMITH RIVER,Gasquet Ranger Station,GAS,384,2011,Nov,12.42,14.14,88.0 NORTH COAST,SMITH RIVER,Gasquet Ranger Station,GAS,384,2011,Dec,25.0,16.37,153.0 NORTH COAST,SMITH RIVER,Gasquet Ranger Station,GAS,384,2011,Jan,4.91,16.45,30.0 NORTH COAST,SMITH RIVER,Gasquet Ranger Station,GAS,384,2011,Feb,9.53,11.95,80.0 NORTH COAST,SMITH RIVER,Gasquet Ranger Station,GAS,384,2011,Mar,26.47,11.08,239.0 NORTH COAST,SMITH RIVER,Gasquet Ranger Station,GAS,384,2011,Apr,10.25,6.47,158.0 NORTH COAST,SMITH RIVER,Gasquet Ranger Station,GAS,384,2011,May,4.67,4.43,105.0 NORTH COAST,SMITH RIVER,Gasquet Ranger Station,GAS,384,2011,Jun,1.62,0.83,195.0