import lxml.html
import numpy as np
import pandas as pd
# constants: the three row categories row_type() can return
REGION = 'region'
SUBREGION = 'subregion'
DATAROW = 'datarow'
# Sample rows from the 2011 CDEC precipitation report, used by the asserts
# below to sanity-check each parsing helper as it is defined.
# NOTE(review): this fetch runs over the network at import time.
h = lxml.html.parse('http://cdec.water.ca.gov/cgi-progs/reports/PRECIPOUT.2011')
table = h.xpath('//*[@id="main_content"]/div/div[1]/table')[0]
rows = table.getchildren()
region_row = rows[0]  # a region header row (see assert on row_type below)
subregion_row = rows[2]  # a subregion/month-labels row
station_row = rows[3]  # a station data row
def get_table(url):
    """Fetch *url* and return the precipitation data table element."""
    doc = lxml.html.parse(url)
    # the report page keeps its single data table at this fixed location
    return doc.xpath('//*[@id="main_content"]/div/div[1]/table')[0]
def row_type(row):
    """Categorize a table row as REGION, SUBREGION, or DATAROW.

    Classification is positional: a <th> first cell marks a region
    header row, an empty second cell marks a subregion row, and
    anything else is a station data row.
    """
    cells = row.iterchildren()
    if next(cells).tag == 'th':
        # first cell is a <th> element
        return REGION
    # BUG FIX: the original compared the stripped text to ' ', which can
    # never be true because str.strip() removes spaces; compare to ''
    # so subregion rows are actually detected (matches the assert below).
    elif next(cells).text_content().strip() == '':
        # second cell is empty
        return SUBREGION
    else:
        return DATAROW
# Sanity-check row_type against the sample rows fetched above.
assert row_type(region_row) == REGION, row_type(region_row)
assert row_type(subregion_row) == SUBREGION, row_type(subregion_row)
assert row_type(station_row) == DATAROW, row_type(station_row)
def get_region(row):
    """Return the region name held in a REGION header row."""
    text = row.text_content()
    return text.strip()
# get_region should return the stripped header text.
assert get_region(region_row) == 'NORTH COAST'
def get_subregion(row):
    """Return (subregion_name, month_labels) parsed from a SUBREGION row.

    The first cell holds the subregion name; cells from index 3 on
    hold the month / summary column labels.
    """
    texts = []
    for cell in row:
        texts.append(cell.text_content().strip())
    return texts[0], texts[3:]
# get_subregion returns the subregion name and the 14 month/summary labels.
assert get_subregion(subregion_row)[0] == 'SMITH RIVER'
assert get_subregion(subregion_row)[1] == \
'Oct Nov Dec Jan Feb Mar Apr May Jun Jul Aug Sep Oct-Sep Wat-Yr'.split()
def get_elev_abbrev(cell):
    """Parse an "ELEV (ABB)"-style cell into (elevation, abbreviation).

    The last six characters are " (ABB)", so everything before them is
    the integer elevation and positions -4:-1 are the station code.
    """
    text = cell.text_content().strip()
    elevation = int(text[:-6])
    abbreviation = text[-4:-1]
    return elevation, abbreviation
# Elevation/abbreviation cell parses as (feet, station code).
assert get_elev_abbrev(station_row.getchildren()[1]) == (384, 'GAS')
def get_precip_data(cell):
    """Parse a precipitation data cell into a list of floats.

    Missing readings ('---' or blank) become NaN; trailing '%' signs
    are dropped before conversion.
    """
    def parse(text):
        if text in ('---', ''):
            return float('nan')
        if text.endswith('%'):
            return float(text[:-1])
        return float(text)
    return [parse(piece.strip()) for piece in cell.itertext()]
# Data cell parses as [precip, average precip, percent of average].
assert get_precip_data(station_row.getchildren()[3]) == [8.4, 7.53, 112]
def iter_months(months, cells):
    """Yield (month_label, parsed_values) for each month data cell."""
    for month, cell in zip(months, cells):
        parsed = get_precip_data(cell)
        yield month, parsed
def table_to_frame(table, year):
    """Convert one year's precipitation HTML table into a DataFrame.

    Walks the rows statefully: REGION and SUBREGION rows update the
    current labels, and each DATAROW emits one record per month column.
    """
    records = []
    rows = table.iterchildren()
    for row in rows:
        kind = row_type(row)
        if kind == REGION:
            region = get_region(row)
            # the row after a region header is the "Station Elev." row
            next(rows)
        elif kind == SUBREGION:
            subregion, months = get_subregion(row)
        elif kind == DATAROW:
            cells = row.iterchildren()
            station = next(cells).text_content().strip()
            elev, abbrev = get_elev_abbrev(next(cells))
            # skip the "Precip Average %-avg" summary cell
            next(cells)
            for month, values in iter_months(months, cells):
                record = [region, subregion, station, abbrev, elev, year,
                          month]
                records.append(record + list(values))
    columns = ['region', 'subregion', 'station', 'abbreviation', 'elevation',
               'year', 'month', 'precip', 'avg precip', 'pct of avg']
    return pd.DataFrame(records, columns=columns)
# Water years to download; the {} in the URL is filled with the year.
years = [2011, 2012, 2013, 2014]
url = 'http://cdec.water.ca.gov/cgi-progs/reports/PRECIPOUT.{}'
def all_data(url, years):
    """Download and combine the precip tables for every year in *years*.

    *url* is a format string with one {} placeholder for the year.
    Returns a single DataFrame with a fresh integer index.
    """
    frames = [table_to_frame(get_table(url.format(year)), year)
              for year in years]
    return pd.concat(frames, ignore_index=True)
# Build the combined DataFrame and export it to CSV.
df = all_data(url, years)
df.to_csv('precip_data.csv', index=False)
# NOTE(review): the original IPython session displayed `df` as a bare
# expression and ran the shell magic `!head precip_data.csv` here; both
# are invalid in a plain Python module and were removed.