#!/usr/bin/env python
# coding: utf-8
# # Importing Data
# In[1]:
import numpy as np
import pandas as pd
pd.options.display.max_rows = 6
pd.options.display.max_columns = 8
pd.__version__
# We often times have a variety of input data.
#
# - CSV
# - Excel
# - SQL
# - JSON
# - HDF5
# - pickle
# - msgpack
# - Stata
# - BigQuery
# This is subset of the data from beeradvocate.com, via [Standford](https://snap.stanford.edu/data/web-RateBeer.html). It's strangely formatted.
#
# This dataset is no longer available!
#
#
# beer/name: Sausa Weizen
# beer/beerI: 47986
# beer/brewerId: 10325
# beer/ABV: 5.00
# beer/style: Hefeweizen
# review/appearance: 2.5
# review/aroma: 2
# review/time: 1234817823
# review/profileName: stcules
# review/text: A lot of foam. But a lot. In the smell some banana, and then lactic and tart. Not a good start. Quite dark orange in color, with a lively carbonation (now visible, under the foam). Again tending to lactic sourness. Same for the taste. With some yeast and banana.
#
# beer/name: Red Moon
# beer/beerId: 48213
# beer/brewerId: 10325
# beer/ABV: 6.20
# ...
#
#
# # CSV
# http://pandas.pydata.org/pandas-docs/stable/io.html#csv-text-files
# In[2]:
df = pd.read_csv('data/beer2.csv.gz',
index_col=0,
parse_dates=['time'],
encoding='utf-8')
# In[3]:
df
# In[4]:
df.info()
# In[5]:
# we have some unicode
df.loc[2,'beer_name']
# In[6]:
df.to_csv('data/beer.csv', index=False, encoding='utf-8')
# # Excel
# http://pandas.pydata.org/pandas-docs/stable/io.html#excel-files
# In[7]:
df.to_excel('data/beer.xls', index=False, encoding='utf-8')
# In[8]:
data = pd.read_excel('data/beer.xls', sheetnames=[0], encoding='utf-8')
# # SQL
# http://pandas.pydata.org/pandas-docs/stable/io.html#sql-queries
# In[9]:
from sqlalchemy import create_engine
get_ipython().system('rm -f data/beer.sqlite')
engine = create_engine('sqlite:///data/beer.sqlite')
# In[10]:
df.to_sql('table', engine)
# In[11]:
data = pd.read_sql('table', engine)
# # JSON
# http://pandas.pydata.org/pandas-docs/stable/io.html#json
# In[12]:
df.to_json('data/beer.json')
# In[13]:
data = pd.read_json('data/beer.json')
# # HDF
# http://pandas.pydata.org/pandas-docs/stable/io.html#hdf5-pytables
# In[14]:
# fixed format
df.to_hdf('data/beer_mixed.hdf',
'df',
mode='w',
format='fixed',
encoding='utf-8')
# In[15]:
df[df.text.isnull()]
# In[16]:
data = pd.read_hdf('data/beer.hdf','df',encoding='utf-8')
# In[17]:
# wildly varying strings
df.text.str.len().describe()
# # Timings
# In[18]:
get_ipython().run_line_magic('timeit', "pd.read_excel('data/beer.xls', sheetnames=[0])")
# In[19]:
get_ipython().run_line_magic('timeit', "pd.read_sql('table', engine)")
# In[20]:
get_ipython().run_line_magic('timeit', "pd.read_json('data/beer.json')")
# In[21]:
get_ipython().run_line_magic('timeit', "pd.read_csv('data/beer.csv', parse_dates=['time'])")
# In[22]:
get_ipython().run_line_magic('timeit', "pd.read_hdf('data/beer.hdf','df')")
# In[23]:
df.to_pickle('data/beer.pkl')
df.to_msgpack('data/beer.msgpack',encoding='utf-8')
# In[24]:
get_ipython().run_line_magic('timeit', "pd.read_pickle('data/beer.pkl')")
# In[25]:
get_ipython().run_line_magic('timeit', "pd.read_msgpack('data/beer.msgpack', encoding='utf-8')")
# # Storing Text vs Data
# http://matthewrocklin.com/blog/work/2015/03/16/Fast-Serialization/
# # Operating on Large Data
# In[26]:
chunks = pd.read_csv('data/beer2.csv.gz',
index_col=0,
parse_dates=['time'],
chunksize=10000)
for i, chunk in enumerate(chunks):
print("%d -> %d" % (i, len(chunk)))
# # Using Odo
# http://odo.readthedocs.org/
# # Questions
#
# - which formats provide good fidelity
# - hdf5, pickle, msgpack
#
# - which formats can you query
# - hdf5, sql
#
# - which formats can you iterate
# - csv, hdf5, sql
#
# - which formats provide better interoprability
# - csv, json, excel
#
# - which formats can you transmit over the wire
# - json, msgpack
#
# - which formats have better compression
# - hdf5, pickle, msgpack
#
# - which formats allow multiple datasets in the same file
# - hdf5, msgpack