import os import os.path import datetime import pandas as pd # Download CSV files def download_csv(filenames): for filename in filenames: print("download {filename}...".format(filename=filename)) os.system("curl http://www.city.sendai.jp/kikaku/seisaku/toukei/jinkou/kakusai/{filename} -o data/{filename}".format(filename=filename)) print("Done!") # download_filenames = ["h2004.xls", "h2104.xls", "h2204.xls" , "h2304.xls","h2404.xls", "h2504.xls", "h2604.xls"] download_filenames = ["h2004.xls"] download_csv(download_filenames) def generate_df(filepath): print("{datetime} {filepath}".format(datetime=datetime.datetime.now(), filepath=filepath)) filename = os.path.basename(filepath) xls = pd.ExcelFile(filepath) # 全ての人口統計を入れるデータフレーム df_all = pd.DataFrame(columns=['year', 'month', 'ward', 'town', 'gender', 'age', 'head_count']) print " Sheet -> ", for sheet_index_num, sheet_name in enumerate(xls.sheet_names): # 『合計』のシートは除く if u'合計' in sheet_name or u'計' in sheet_name or u'総数' in sheet_name: continue print sheet_name, df_xls= xls.parse(xls.sheet_names[sheet_index_num], skiprows=3) heisei_year = filename[1:3] df_xls['year'] = int(heisei_year) + 1988 df_xls['month'] = int(filename[3:5]) # ward と town をクレンジングして追加 df_xls['ward'] = df_xls.iloc[:,0].fillna(method='ffill').apply(lambda x : x.strip()) df_xls['town'] = df_xls.iloc[:,1].fillna(method='ffill').apply(lambda x : x.strip()) # 『町名』列にある不要な『合計』行を削除 df_xls = df_xls[(df_xls['town'] != u'合計') & (df_xls['town'] != u'計') & (df_xls['town'] != u'総数') ] # 性別を振り分け gender = None if u'男' in sheet_name: gender = '男' if u'女' in sheet_name: gender = '男' df_xls['gender'] = gender df_xls.head() # 年齢毎に処理 for age in range(0, 101): df_xls['age'] = age df_xls['head_count'] = df_xls.iloc[:, age + 3].apply(lambda x: x if isinstance(x, int) else None).fillna(method='ffill') df_all= pd.concat([df_all, df_xls.iloc[:, 104:]]) print("Done!") return df_all def get_df_all_demographic(filenames, dirpath = 'data/'): df = pd.DataFrame() dirpath = 'data/' print("Convert {num} files...".format(num=len(filenames))) for filename in filenames: filepath = dirpath + filename df = pd.concat([df, generate_df(filepath)]) return df # filenames = ["h2004.xls", "h2104.xls", "h2204.xls" , "h2304.xls","h2404.xls", "h2504.xls", "h2604.xls"] # NOTE: h20~h23のcsvのデータ形式が異なるので除外 filenames = ["h2404.xls", "h2504.xls", "h2604.xls"] # データ読み込み df_demographic = get_df_all_demographic(filenames) df_demographic.info() df_demographic # 読み込んだデータをCSVに書き出し df_demographic.to_csv('data/all_demographic.csv', index=False, encoding='UTF-8') %ls -l data/all_demographic.csv # 2014年4月の仙台の総人口 df_demographic[df_demographic.year == 2014].head_count.sum() df_work = pd.DataFrame(columns=['year', 'group', 'ward', 'head_count'], index=range(len(df_demographic.ward.unique()))) df_work latest_year = int(df_demographic.year.unique().max()) df_work['ward'] = df_demographic[df_demographic.year == latest_year].groupby(['ward'])['head_count'].sum().index df_work['group'] = u'中学生' df_work # 最新年から10年間 df = pd.DataFrame() for i in range(10): df_work.year = latest_year + i df_work.head_count = df_demographic[(df_demographic.year == latest_year) & (12 - i <= df_demographic.age) & (df_demographic.age <= 12 + 2 - i)].groupby(['ward'])['head_count'].sum().values df = pd.concat([df, df_work]) df df_pivot = pd.pivot_table(df, values='head_count', index='year', columns='ward', aggfunc=sum, fill_value=0) df_pivot df_pivot.plot() prop_mac_jp = {'fname' : r'/Library/Fonts/Osaka.ttf'} plt.legend(prop=prop_mac_jp, loc='lower left')