from __future__ import division from glob import glob import csv,re for name in glob('data/raw/*.dat'): label = name[15:19] print "Parsing data:",label with open(name) as in_file: with open("data/csv/"+label+".csv",'w') as out_file: writer = csv.writer(out_file) header = ["id","year"] header += [str(month+1)+"_"+label for month in xrange(12)] writer.writerow(header) for line in in_file: # if int(line[11:15]) < 2000: # continue components = [line[0:11],line[11:15]] components += [int(line[19+x*8:24+x*8])/100 for x in range(12)] writer.writerow(components) def title(name): """Title case with correct handling of internal apostrophes""" return re.sub("([a-z])'([A-Z])", lambda m: m.group(0).lower(), name.title()) cc_file = open('data/raw/country-codes') ccodes = {l[:3]:title(l[4:].strip()) for l in cc_file} for name in glob('data/raw/*.inv'): label = name[15:19] print "Parsing index:",label with open(name) as in_file: with open("data/csv/"+label+"-inv.csv",'w') as out_file: writer = csv.writer(out_file) writer.writerow(["id","lat","long","name","country"]) for line in in_file: # if not line[74:79].strip().isdigit() or int(line[74:79]) < 100: # continue name = title(line[38:68]) name = name[:name.find(" ")].strip() components = [line[0:11],line[12:20],line[21:30],name,ccodes[line[:3]]] writer.writerow(components) import pandas as pd tavg_dataframe = pd.read_csv("data/csv/tavg.csv") tavg_dataframe.head() import numpy as np data = None for typ in ["tavg","tmin","tmax"]: # -99.99 is used in this dataset to represent missing data, so we replace it with NaN (not a number) # This keeps it from being used in calculations. typ_data = pd.read_csv("data/csv/"+typ+".csv").replace(-99.99,np.NaN) typ_data[typ] = typ_data.drop(['id','year'],axis=1).mean(axis=1,skipna=True) typ_data = typ_data[['id','year',typ]] if data is None: data = typ_data else: data = pd.merge(data, typ_data, on=['year','id']) index = pd.read_csv("data/csv/tavg-inv.csv") data = pd.merge(data, index, on=['id']) data = data.dropna() data.head() import matplotlib.pyplot as plt %matplotlib inline years = xrange(data['year'].min(),data['year'].max()) counts = [data[data.year == year].count() for year in years] plt.plot(years,counts,color='red') plt.xlabel('Year'); plt.ylabel('Stations Online') plt.show() from datetime import datetime data = data[(data.year >= 1950) & (data.year <= 2013)] data['date'] = data['year'].apply(lambda x: datetime(x,1,1)) early_years = data[(data.year >= 1950) & (data.year < 1955)][['id','tavg']] early_years = early_years.groupby('id').mean() late_years = data[(data.year >= 2008) & (data.year < 2013)][['id','tavg']] late_years = late_years.groupby('id').mean() deltas = (late_years['tavg']-early_years['tavg']).dropna() change = index.copy().set_index('id') change['delta'] = deltas change = change.dropna() print "Total stations represented:",len(change) change.groupby('country').count().sort('country',ascending=False)[['country']] print change['delta'].describe() plt.hist(list(change['delta']),50) plt.xlim([-5,5]) plt.title('Average Temperature Change, 1950-2010') plt.xlabel('Delta Temperature (C)') plt.ylabel('Count of Stations') plt.savefig('viz/temp_change.png') plt.show() change.delta[change.delta > 5] = 5 change.delta[change.delta < -5] = -5 change['cat'] = pd.cut(change['delta'], 7) cats = change.sort('delta')['cat'].unique() colors = ["#2166ac","#67a9cf","#d1e5f0","#f7f7f7","#fddbc7","#ef8a62","#b2182b"] cat_colors = {cat:color for cat, color in zip(cats,colors)} list(cats) import folium, vincent vincent.core.initialize_notebook() def popup_chart(station_id): weather_frame = data[data.id == station_id][['date','tmax','tavg','tmin']] weather_frame = weather_frame.rename(columns={'tmax':'Max','tmin':'Min','tavg':'Average'}).set_index('date') station = index[index.id == station_id].iloc[0] # Find the station in the index chart = vincent.Line(weather_frame) chart.axis_titles(x=station['name']+", "+station["country"],y="Temp (C)") chart.legend(title="Key") chart.width=400 chart.height=200 chart.title=station['name']+", "+station["country"] chart.padding={'top':20,'right':80,'bottom':40,'left':40} chart.to_json("viz/historical_map/data_{0}.json".format(station['id'])) return chart chart = popup_chart(42572734000) chart.display() folium.initialize_notebook() f_map = folium.Map( location=[0,0], zoom_start=2, tiles="Mapbox Bright") for station_id,station in change.iterrows(): f_map.polygon_marker([station['lat'],station['long']], radius=5, popup=(popup_chart(station_id),"data_{0}.json".format(station_id)), line_color='grey', line_weight=1, fill_color=cat_colors[station['cat']], fill_opacity=1 ) f_map.create_map(path='viz/historical_map/map.html') f_map