from pathlib import Path import pandas as pd import numpy as np from pyquery import PyQuery as pq import re BASE_PATH = Path('../勞動部') OUT_PATH = BASE_PATH / 'parsed' html_list = [p for p in BASE_PATH.iterdir() if p.suffix == '.htm'] html_list dom = pq(html_list[0].open().read()) dfs = pd.read_html(dom.html(), header=0, infer_types=False) df = dfs[0] shifted_ix = df.loc[df['單位'] == 'nan'].index df.ix[shifted_ix, 1:] = df.ix[shifted_ix, :-1].values df.loc[shifted_ix, '季'] = np.NaN df = df.ffill() df['媒體'] = df.apply( lambda r: '{}:{}'.format(r['媒體型態'], r['刊播媒體']), axis=1 ) new_index = df.columns.values new_index[1] = '主題' new_index[4] = '時間' new_index[6] = '金額' new_index[7] = '執行單位' df.columns = new_index df['備註']= df.apply( lambda r: '露出次數:{}\n季:{}'.format(r[5], r[0]), axis=1 ) parse_budget = re.compile('(.*)」?總經費(.*):\s*([0-9,]*)\s*(.*)', re.UNICODE).match def norm_budget(row): new_row = row.copy() v = row['金額'] if v.isnumeric(): # 正常的金額 # print(v, 'is a valid number') new_row['金額'] = int(v) return new_row match = parse_budget(v) if match: prj, prj_note, budget, desc = match.groups() prj += prj_note # special case if '報紙宣導案' in prj: prj = '「報紙宣導案」' if prj in all_prj: # 已經出現過的專案 # print(prj, 'appeared, budget set to 0') new_row['金額'] = 0 new_row['備註'] += ('\n專案:' + prj + desc) return new_row else: budget = int(budget.replace(',', '')) # print('add', prj, 'with', budget) all_prj.add(prj) new_row['金額'] = budget new_row['備註'] += ('\n專案:' + prj + desc) return new_row else: print(v, 'is sth strange') return new_row all_prj = set() df = df.apply(norm_budget, axis=1) df['行政部會'] = '勞動部' df['民國年'] = '101' df = df[['主題', '媒體', '民國年', '時間', '金額', '執行單位', '行政部會', '備註']] out_stem = str(OUT_PATH / '{}'.format(html_list[0].stem)) df.to_csv(out_stem + '.csv', index=False) df.to_excel(out_stem + '.xlsx', index=False) dom = pq(html_list[1].open().read()) df = pd.read_html(dom.html(), header=0, infer_types=False)[0] shifted_ix = df.loc[df['單位'] == 'nan'].index df.ix[shifted_ix, 1:] = df.ix[shifted_ix, :-1].values df.loc[shifted_ix, '季'] = np.NaN # row 1 is special case df.ix[1, 2:6] = df.ix[1, 1:5].values df.iloc[1, 6] = np.NaN df = df.ffill() df.head() df['媒體'] = df.apply( lambda r: '{}:{}'.format(r['媒體型態'], r['刊播媒體']), axis=1 ) new_index = df.columns.values new_index[1] = '主題' new_index[4] = '時間' new_index[6] = '金額' new_index[7] = '執行單位' df.columns = new_index df['備註']= df.apply( lambda r: '露出次數:{}\n季:{}'.format(r[5], r[0]), axis=1 ) all_prj = set() df = df.apply(norm_budget, axis=1) df.head()