import lxml.html
import numpy as np
import pandas as pd

# constants
REGION = 'region'
SUBREGION = 'subregion'
DATAROW = 'datarow'

h = lxml.html.parse('http://cdec.water.ca.gov/cgi-progs/reports/PRECIPOUT.2011')
table = h.xpath('//*[@id="main_content"]/div/div[1]/table')[0]
rows = table.getchildren()
region_row = rows[0]
subregion_row = rows[2]
station_row = rows[3]


def get_table(url):
    """Return HTML table from URL."""
    xpath = '//*[@id="main_content"]/div/div[1]/table'
    return lxml.html.parse(url).xpath(xpath)[0]


def row_type(row):
    """Categorize a row."""
    cells = row.iterchildren()
    if next(cells).tag == 'th':
        # first cell is a <th> element
        return REGION
    elif next(cells).text_content().strip() == '':
        # second cell is empty
        return SUBREGION
    else:
        return DATAROW


assert row_type(region_row) == REGION, row_type(region_row)
assert row_type(subregion_row) == SUBREGION, row_type(subregion_row)
assert row_type(station_row) == DATAROW, row_type(station_row)


def get_region(row):
    """Get region name from a REGION row."""
    return row.text_content().strip()


assert get_region(region_row) == 'NORTH COAST'


def get_subregion(row):
    """Parse subregion and month names from SUBREGION row."""
    cell_text = [cell.text_content().strip() for cell in row]
    return cell_text[0], cell_text[3:]


assert get_subregion(subregion_row)[0] == 'SMITH RIVER'
assert get_subregion(subregion_row)[1] == \
    'Oct Nov Dec Jan Feb Mar Apr May Jun Jul Aug Sep Oct-Sep Wat-Yr'.split()


def get_elev_abbrev(cell):
    """Parse elevation and station abbreviation."""
    tc = cell.text_content().strip()
    return int(tc[:-6]), tc[-4:-1]


assert get_elev_abbrev(station_row.getchildren()[1]) == (384, 'GAS')


def get_precip_data(cell):
    """Parse a precip data cell."""
    tc = [s.strip() for s in cell.itertext()]
    for i, t in enumerate(tc):
        if t in {'---', ''}:
            tc[i] = float('nan')
        elif t.endswith('%'):
            tc[i] = float(t[:-1])
        else:
            tc[i] = float(t)
    return tc


assert get_precip_data(station_row.getchildren()[3]) == [8.4, 7.53, 112]


def iter_months(months, cells):
    """Iterate over months and month data cells."""
    for m, c in zip(months, cells):
        yield m, get_precip_data(c)


def table_to_frame(table, year):
    """Turn HTML table of precip data into a DataFrame."""
    datarows = []
    rows = table.iterchildren()
    while True:
        try:
            r = next(rows)
        except StopIteration:
            break
        rt = row_type(r)
        if rt == REGION:
            region = get_region(r)
            next(rows)  # burn the "Station Elev." row
        elif rt == SUBREGION:
            subregion, months = get_subregion(r)
        elif rt == DATAROW:
            cells = r.iterchildren()
            station = next(cells).text_content().strip()
            elev, abbrev = get_elev_abbrev(next(cells))
            next(cells)  # burn the Precip Average %-avg cell
            for m, values in iter_months(months, cells):
                dr = [region, subregion, station, abbrev, elev, year, m]
                datarows.append(dr + list(values))
    return pd.DataFrame(datarows, columns=[
        'region', 'subregion', 'station', 'abbreviation', 'elevation',
        'year', 'month', 'precip', 'avg precip', 'pct of avg'])


years = [2011, 2012, 2013, 2014]
url = 'http://cdec.water.ca.gov/cgi-progs/reports/PRECIPOUT.{}'


def all_data(url, years):
    frames = []
    for y in years:
        df = table_to_frame(get_table(url.format(y)), y)
        frames.append(df)
    return pd.concat(frames, ignore_index=True)


df = all_data(url, years)
df

df.to_csv('precip_data.csv', index=False)
!head precip_data.csv
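
# A quick sanity check on the saved CSV -- a minimal sketch, assuming the file
# written above. Column names ('region', 'year', 'month', 'precip') come from
# the DataFrame built in table_to_frame; the 'Oct-Sep' and 'Wat-Yr' labels are
# the summary columns seen in the month-name assert, excluded here so monthly
# values aren't double counted. Actual totals depend on the downloaded data.
check = pd.read_csv('precip_data.csv')
monthly = check[~check['month'].isin(['Oct-Sep', 'Wat-Yr'])]
totals = monthly.groupby(['region', 'year'])['precip'].sum()
totals.head()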