!pip install cloudant import cloudant account = cloudant.Account('parente') database = account.database('rita_transtats_2014_06') database.get().json() items = [] for i, item in enumerate(database.all_docs(params={'include_docs' : True})): if i > 1: break items.append(item) print items import pandas pandas.DataFrame([item['doc'] for item in items]) columns = [u'FL_DATE', u'ORIGIN_STATE_ABR', u'DEST_STATE_ABR', u'ARR_DEL15', u'ARR_DELAY_NEW', u'DEP_DEL15', u'DEP_DELAY_NEW', u'DISTANCE', u'DISTANCE_GROUP',] %%time dfs = [] buff = [] for i, item in enumerate(database.all_docs(params={'include_docs' : True})): buff.append(item['doc']) if i > 0 and i % 20000 == 0: print 'Processed #{}'.format(i) df = pandas.DataFrame(buff, columns=columns) dfs.append(df) buff = [] # don't forget the leftovers df = pandas.DataFrame(buff, columns=columns) dfs.append(df) df = pandas.concat(dfs) assert len(df) == database.get().json()['doc_count'] del dfs del buff !free -m df = df.reset_index(drop=True) df.DEP_DEL15.value_counts() / len(df) df.ARR_DEL15.value_counts() / len(df) %matplotlib inline import matplotlib.pyplot as plt import seaborn as sns sns.set_palette("deep", desat=.6) colors = sns.color_palette("deep") sns.set_context(rc={"figure.figsize": (18, 5)}) by_origin_state = df.groupby('ORIGIN_STATE_ABR') departure_delay_counts = by_origin_state.DEP_DEL15.sum() by_dest_state = df.groupby('DEST_STATE_ABR') arrival_delay_counts = by_dest_state.ARR_DEL15.sum() delay_df = pandas.DataFrame([departure_delay_counts, arrival_delay_counts]).T delay_df.sort('DEP_DEL15', ascending=False).plot(kind='bar', title='Number of delayed flights by state') pct_departure_delay = departure_delay_counts / df.ORIGIN_STATE_ABR.value_counts() pct_arrival_delay = arrival_delay_counts / df.DEST_STATE_ABR.value_counts() pct_departure_delay.order(ascending=False).plot(kind='bar', title='% flights with departure delays by origin state') pct_arrival_delay.order(ascending=False).plot(kind='bar', color=colors[1], title='% flights with arrival delay by destination state') pct_delay_df = pandas.DataFrame([pct_departure_delay, pct_arrival_delay], index=['PCT_DEP_DEL15', 'PCT_ARR_DEL15']).T pct_delay_df.sort('PCT_ARR_DEL15', ascending=False).plot(kind='bar', title='Overlapping % delay plots for comparison') from __future__ import division delay_counts_df = df[['ORIGIN_STATE_ABR', 'DEST_STATE_ABR', 'ARR_DEL15']].groupby(['ORIGIN_STATE_ABR', 'DEST_STATE_ABR']).sum() delay_counts_df.head() support = (delay_counts_df / len(df)) support.head() support = support.unstack() support.head() support = support.T.reset_index(level=0, drop=True).T support.head() import numpy as np def asymmatplot(plotmat, names=None, cmap="Greys", cmap_range=None, ax=None, **kwargs): ''' Plot an asymmetric matrix with colormap and statistic values. A modification of the symmatplot() function in Seaborn to show the upper-half of the matrix. See https://github.com/mwaskom/seaborn/blob/master/seaborn/linearmodels.py for the original. ''' if ax is None: ax = plt.gca() nvars = len(plotmat) if cmap_range is None: vmax = np.nanmax(plotmat) * 1.15 vmin = np.nanmin(plotmat) * 1.15 elif len(cmap_range) == 2: vmin, vmax = cmap_range else: raise ValueError("cmap_range argument not understood") mat_img = ax.matshow(plotmat, cmap=cmap, vmin=vmin, vmax=vmax, **kwargs) plt.colorbar(mat_img, shrink=.75) ax.xaxis.set_ticks_position("bottom") ax.set_xticklabels(names, rotation=90) ax.set_yticklabels(names) minor_ticks = np.linspace(-.5, nvars - 1.5, nvars) ax.set_xticks(minor_ticks, True) ax.set_yticks(minor_ticks, True) major_ticks = np.linspace(0, nvars - 1, nvars) ax.set_xticks(major_ticks) ax.set_yticks(major_ticks) ax.grid(False, which="major") ax.grid(True, which="minor", linestyle="-") return ax fig, ax = plt.subplots(figsize=(18,18)) asymmatplot(support, names=support.columns, ax=ax, cmap='OrRd') trip_counts_df = df[['ORIGIN_STATE_ABR', 'DEST_STATE_ABR', 'FL_DATE']].groupby(['ORIGIN_STATE_ABR', 'DEST_STATE_ABR']).count() delay_counts_df = delay_counts_df.rename_axis({'ARR_DEL15' : 'COUNTS'}, axis=1) trip_counts_df = trip_counts_df.rename_axis({'FL_DATE' : 'COUNTS'}, axis=1) mat = (delay_counts_df / trip_counts_df).unstack().T.reset_index(level=0, drop=True).T fig, ax = plt.subplots(figsize=(18,18)) asymmatplot(mat, names=mat.columns, ax=ax, cmap='OrRd', cmap_range=(0., 1.0)) print delay_counts_df.loc['RI', 'CO'] print trip_counts_df.loc['RI', 'CO'] fig, ax = plt.subplots(figsize=(18,10)) sns.boxplot(df.ARR_DELAY_NEW, df.FL_DATE, ax=ax) fig.autofmt_xdate() fig, ax = plt.subplots(figsize=(18,10)) sns.boxplot(df.ARR_DELAY_NEW, df.FL_DATE, ax=ax, showfliers=False) fig.autofmt_xdate() !cal 6 2014