import pandas as pd

# Station data
station_df = pd.read_csv('station20140303free.csv')
# Line data
connection_df = pd.read_csv('line20140303free.csv')
# Station-to-station connection data, i.e. the graph
graph_df = pd.read_csv('join20140303.csv')
# Railway company data
company_df = pd.read_csv('company20130120.csv')
# Table of prefecture names and prefecture IDs
pref_df = pd.read_csv('pref.csv')

# List of line names (used later)
line_names = connection_df[['line_cd', 'line_name']]
line_names.head()

station_df.drop(['station_name_k', 'station_name_r', 'e_sort'], axis=1, inplace=True)

SCALE_FACTOR = 10000
station_df['x'] = station_df['lon'] * SCALE_FACTOR
station_df['y'] = station_df['lat'] * (-SCALE_FACTOR)

# Convert prefecture IDs to prefecture names for human readability
merged_stations = pd.merge(pref_df, station_df, on='pref_cd')
# Make the line names human-readable as well
merged_stations = pd.merge(merged_stations, line_names, on='line_cd')
# Remove columns that are no longer needed
merged_stations.drop(['pref_cd', 'line_cd'], axis=1, inplace=True)
merged_stations.head()

# Remove unneeded columns
connection_df.drop(['line_color_c', 'line_color_t', 'line_type', 'e_sort'], axis=1, inplace=True)
connection_df.head(5)

connection_final_df = pd.merge(connection_df, company_df, on='company_cd')
connection_final_df.drop(['e_sort'], axis=1, inplace=True)
connection_final_df.head()

# Merge the graph and the line data
graph_new_df = pd.merge(connection_final_df, graph_df, on='line_cd')

# Collect all unique station IDs from the station data frame
# (a set, so the membership tests below are fast)
all_stations = set(merged_stations['station_cd'].unique())

# Extract the unique station IDs that appear in the graph data
st1 = graph_new_df['station_cd1']
st2 = graph_new_df['station_cd2']
stations_in_graph = pd.concat([st1, st2]).unique()

# Function to check for stations that have no data
def has_loc(station_id1, station_id2, all_stations):
    return station_id1 in all_stations and station_id2 in all_stations

# Apply it row by row with a lambda
graph_new_df['has_station_data'] = graph_new_df.apply(
    lambda row: has_loc(row['station_cd1'], row['station_cd2'], all_stations),
    axis=1)

# Remove edges that have no data
graph_final_df = graph_new_df[graph_new_df['has_station_data']]
graph_final_df.head(3)

graph_final_df.to_csv('graph_disconnected.csv')
merged_stations.to_csv('stations.csv')

# Extract the groups (inefficient and therefore slow, of course, but simple)
groups = [merged_stations[merged_stations['station_g_cd'] == g]
          for g in merged_stations['station_g_cd'].unique()]

# Utility functions for building the clique edges
def add_cl(df, edges):
    group_members = df['station_cd']
    processed = set()
    for station_id in group_members:
        add_edge(station_id, group_members, edges, processed)

def add_edge(current_station, stations, edges, processed):
    for station in stations:
        if station != current_station and station not in processed:
            edges.append([station, 0, current_station])
    processed.add(current_station)

# Container for the new edges
group_edges = []
for df in groups:
    if len(df) != 1:
        add_cl(df, group_edges)

# Finally, into a data frame
cliques_df = pd.DataFrame(group_edges, columns=['station_cd1', 'line_cd', 'station_cd2'])

# Write out just the cliques as a table in SIF format
cliques_df.to_csv('cliques.sif', sep=' ', index=False)

merged_graph = pd.concat([graph_final_df, cliques_df])
merged_graph.to_csv('graph_connected.csv')

import xml.etree.ElementTree as ET

tree = ET.parse('fixed.xml')
root = tree.getroot()

PREFIX = '{http://nlftp.mlit.go.jp/ksj/schemas/ksj-app}'

passenger_array = []
entries = root.findall('./' + PREFIX + 'TheNumberofTheStationPassengersGettingonandoff')

# Extract the field names
column_names = []
for col in entries[0]:
    column_names.append(col.tag.split('}')[1])

# Extract the data (in Python 3 every string is already Unicode,
# so no explicit UTF-8 encoding is needed here)
for data in entries:
    row = []
    for entry in data:
        row.append(entry.text)
    passenger_array.append(row)

# Into a data frame
passenger_df = pd.DataFrame(passenger_array, columns=column_names)
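# Before cleaning the passenger table, it can be worth sanity-checking the graph
# written above: the clique edges should collapse many per-line fragments into
# transfer hubs. A minimal sketch using networkx (not part of the original
# pipeline); it assumes networkx is installed and reads the CSVs written earlier.
import networkx as nx

def component_count(path):
    # Build an undirected graph from a station-to-station edge list
    edge_df = pd.read_csv(path)
    g = nx.Graph()
    g.add_edges_from(zip(edge_df['station_cd1'], edge_df['station_cd2']))
    return nx.number_connected_components(g)

# The connected version should report noticeably fewer components
print(component_count('graph_disconnected.csv'))
print(component_count('graph_connected.csv'))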
# Hand-build human-readable lookup tables from the spec sheet
# (a pretty crude way to do it...)
railroad_division = [
    ['普通鉄道 JR', '11'],
    ['普通鉄道', '12'],
    ['鋼索鉄道', '13'],
    ['懸垂式鉄道', '14'],
    ['跨座式鉄道', '15'],
    ['案内軌条式鉄道', '16'],
    ['無軌条鉄道', '17'],
    ['軌道', '21'],
    ['懸垂式モノレール', '22'],
    ['跨座式モノレール', '23'],
    ['案内軌条式', '24'],
    ['浮上式', '25']
]

railroad_company_classification = [
    ['JR新幹線', '1'],
    ['JR在来線', '2'],
    ['公営鉄道', '3'],
    ['民営鉄道', '4'],
    ['第三セクター', '5']
]

rd_df = pd.DataFrame(railroad_division,
                     columns=['rail_type', 'railroadDivision'])
company_type_df = pd.DataFrame(railroad_company_classification,
                               columns=['company_type', 'railroadCompanyClassification'])

passenger_df.to_csv('passenger_original.csv')
passenger_df.head()

remarks = pd.Series(passenger_df['remarks2011'].unique())
remarks[:10]

# Keep only rows with complete, non-duplicated data for both years
passenger_df['is_complete'] = passenger_df.apply(
    lambda row: row['dataEorN2011'] == '1' and row['dataEorN2012'] == '1'
    and row['duplicate2011'] == '1' and row['duplicate2012'] == '1'
    and pd.isnull(row['remarks2011']) and pd.isnull(row['remarks2012']),
    axis=1)
passenger_filtered = passenger_df[passenger_df['is_complete']].copy()

# Remove information that is no longer needed
passenger_filtered.drop(['is_complete', 'station', 'duplicate2011', 'duplicate2012',
                         'remarks2011', 'remarks2012', 'dataEorN2011', 'dataEorN2012'],
                        axis=1, inplace=True)
passenger_filtered.head()

temp_df = pd.merge(passenger_filtered, rd_df, on='railroadDivision')
passenger_final_df = pd.merge(temp_df, company_type_df, on='railroadCompanyClassification')

# Remove columns that are no longer needed
passenger_final_df.drop(['railroadDivision', 'railroadCompanyClassification'],
                        axis=1, inplace=True)

passenger_final_df.to_csv('passenger_final.csv')
passenger_final_df[4000:4010]

passenger_final_df[passenger_final_df['stationName'] == '南阿蘇白川水源']

tokyo_metro = passenger_final_df[
    passenger_final_df['administrationCompany'] == '東京地下鉄'].copy()

# Build line names of the form 東京メトロ〇〇線 from the company and route names
def create_line_name(company, line):
    prefix = company.replace('地下鉄', 'メトロ')
    suffix = line.split('線')[1]
    return prefix + suffix + '線'

tokyo_metro['line_name'] = tokyo_metro.apply(
    lambda row: create_line_name(row['administrationCompany'], row['routeName']),
    axis=1)
tokyo_metro['station_name'] = tokyo_metro['stationName']

merged = pd.merge(merged_stations, tokyo_metro, on=['station_name', 'line_name'])
merged['passengers2012'] = merged['passengers2012'].astype(int)
merged['passengers2011'] = merged['passengers2011'].astype(int)

# Show the top 10 stations by ridership to sanity-check the result
sorted_df = merged.sort_values(by='passengers2012', ascending=False)
sorted_df[['station_name', 'line_name', 'passengers2012']][:10]

# Write out the result
merged.to_csv('metro_data_table.csv')

from lxml.html import parse
from urllib.request import urlopen

page = parse(urlopen('http://ja.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC%E3%81%AE%E9%89%84%E9%81%93%E3%83%A9%E3%82%A4%E3%83%B3%E3%82%AB%E3%83%A9%E3%83%BC%E4%B8%80%E8%A6%A7'))
doc = page.getroot()
table_rows = doc.findall('.//tr')

color_list = []
for row in table_rows:
    line_name = None
    line_color = None
    row_data = row.findall('.//td')
    for val in row_data:
        style = val.get('style')
        for child in val:
            # Compare with '==', not 'is': tag values are strings, and
            # identity comparison against a literal is a bug
            if child.tag == 'a':
                line_name = child.get('title')
        if style is not None:
            line_color = style
    if line_name is not None and line_color is not None:
        new_color = line_color.split(';')[0].replace('background:', '')
        color_list.append([line_name, new_color])

# Into a data frame
color_df = pd.DataFrame(color_list, columns=['line_name', 'line_color'])

# Write out as CSV
color_df.to_csv('line_colors.csv')
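# As a possible follow-up, the scraped colors can be joined onto the metro table
# for styling a visualization (e.g. in Cytoscape). A minimal sketch; it assumes
# the Wikipedia titles in color_df['line_name'] line up with the line_name values
# built earlier, which in practice may need manual normalization first.
styled = pd.merge(merged, color_df, on='line_name', how='left')
# Lines missing from the Wikipedia table fall back to a neutral gray
# (an arbitrary placeholder, not taken from the original data)
styled['line_color'] = styled['line_color'].fillna('#999999')
styled.to_csv('metro_data_with_colors.csv')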