#!/usr/bin/env python # coding: utf-8 # # Importing Data # In[1]: import numpy as np import pandas as pd pd.options.display.max_rows = 6 pd.options.display.max_columns = 8 pd.__version__ # We often times have a variety of input data. # # - CSV # - Excel # - SQL # - JSON # - HDF5 # - pickle # - msgpack # - Stata # - BigQuery # This is subset of the data from beeradvocate.com, via [Standford](https://snap.stanford.edu/data/web-RateBeer.html). It's strangely formatted. # # This dataset is no longer available! # #

# beer/name: Sausa Weizen
# beer/beerI: 47986
# beer/brewerId: 10325
# beer/ABV: 5.00
# beer/style: Hefeweizen
# review/appearance: 2.5
# review/aroma: 2
# review/time: 1234817823
# review/profileName: stcules
# review/text: A lot of foam. But a lot. In the smell some banana, and then lactic and tart. Not a good start. Quite dark orange in color, with a lively carbonation (now visible, under the foam). Again tending to lactic sourness. Same for the taste. With some yeast and banana.
#
# beer/name: Red Moon
# beer/beerId: 48213
# beer/brewerId: 10325
# beer/ABV: 6.20
# ...
#

# # # CSV # http://pandas.pydata.org/pandas-docs/stable/io.html#csv-text-files # In[2]: df = pd.read_csv('data/beer2.csv.gz', index_col=0, parse_dates=['time'], encoding='utf-8') # In[3]: df # In[4]: df.info() # In[5]: # we have some unicode df.loc[2,'beer_name'] # In[6]: df.to_csv('data/beer.csv', index=False, encoding='utf-8') # # Excel # http://pandas.pydata.org/pandas-docs/stable/io.html#excel-files # In[7]: df.to_excel('data/beer.xls', index=False, encoding='utf-8') # In[8]: data = pd.read_excel('data/beer.xls', sheetnames=[0], encoding='utf-8') # # SQL # http://pandas.pydata.org/pandas-docs/stable/io.html#sql-queries # In[9]: from sqlalchemy import create_engine get_ipython().system('rm -f data/beer.sqlite') engine = create_engine('sqlite:///data/beer.sqlite') # In[10]: df.to_sql('table', engine) # In[11]: data = pd.read_sql('table', engine) # # JSON # http://pandas.pydata.org/pandas-docs/stable/io.html#json # In[12]: df.to_json('data/beer.json') # In[13]: data = pd.read_json('data/beer.json') # # HDF # http://pandas.pydata.org/pandas-docs/stable/io.html#hdf5-pytables # In[14]: # fixed format df.to_hdf('data/beer_mixed.hdf', 'df', mode='w', format='fixed', encoding='utf-8') # In[15]: df[df.text.isnull()] # In[16]: data = pd.read_hdf('data/beer.hdf','df',encoding='utf-8') # In[17]: # wildly varying strings df.text.str.len().describe() # # Timings # In[18]: get_ipython().run_line_magic('timeit', "pd.read_excel('data/beer.xls', sheetnames=[0])") # In[19]: get_ipython().run_line_magic('timeit', "pd.read_sql('table', engine)") # In[20]: get_ipython().run_line_magic('timeit', "pd.read_json('data/beer.json')") # In[21]: get_ipython().run_line_magic('timeit', "pd.read_csv('data/beer.csv', parse_dates=['time'])") # In[22]: get_ipython().run_line_magic('timeit', "pd.read_hdf('data/beer.hdf','df')") # In[23]: df.to_pickle('data/beer.pkl') df.to_msgpack('data/beer.msgpack',encoding='utf-8') # In[24]: get_ipython().run_line_magic('timeit', "pd.read_pickle('data/beer.pkl')") # In[25]: get_ipython().run_line_magic('timeit', "pd.read_msgpack('data/beer.msgpack', encoding='utf-8')") # # Storing Text vs Data # http://matthewrocklin.com/blog/work/2015/03/16/Fast-Serialization/ # # Operating on Large Data # In[26]: chunks = pd.read_csv('data/beer2.csv.gz', index_col=0, parse_dates=['time'], chunksize=10000) for i, chunk in enumerate(chunks): print("%d -> %d" % (i, len(chunk))) # # Using Odo # http://odo.readthedocs.org/ # # Questions # # - which formats provide good fidelity # - hdf5, pickle, msgpack # # - which formats can you query # - hdf5, sql # # - which formats can you iterate # - csv, hdf5, sql # # - which formats provide better interoprability # - csv, json, excel # # - which formats can you transmit over the wire # - json, msgpack # # - which formats have better compression # - hdf5, pickle, msgpack # # - which formats allow multiple datasets in the same file # - hdf5, msgpack