%pylab --no-import-all inline import matplotlib.pyplot as plt import numpy as np from pylab import figure, show from pandas import DataFrame, Series import pandas as pd import os USAGOV_BITLY_PATH = os.path.join(os.pardir, "pydata-book", "ch02", "usagov_bitly_data2012-03-16-1331923249.txt") MOVIELENS_DIR = os.path.join(os.pardir, "pydata-book", "ch02", "movielens") NAMES_DIR = os.path.join(os.pardir, "pydata-book", "ch02", "names") assert os.path.exists(USAGOV_BITLY_PATH) assert os.path.exists(MOVIELENS_DIR) assert os.path.exists(NAMES_DIR) open(USAGOV_BITLY_PATH).readline() import json records = [json.loads(line) for line in open(USAGOV_BITLY_PATH)] # list comprehension len(records) # list of dict -> DataFrame frame = DataFrame(records) frame.head() # let's take a look at the data # my local dir: /Users/raymondyee/D/Document/Working_with_Open_Data/pydata-book/ch02/movielens !head $MOVIELENS_DIR/movies.dat # how many movies? !wc $MOVIELENS_DIR/movies.dat !head $MOVIELENS_DIR/users.dat !head $MOVIELENS_DIR/ratings.dat import pandas as pd import os unames = ['user_id', 'gender', 'age', 'occupation', 'zip'] users = pd.read_table(os.path.join(MOVIELENS_DIR, 'users.dat'), sep='::', header=None, names=unames) rnames = ['user_id', 'movie_id', 'rating', 'timestamp'] ratings = pd.read_table(os.path.join(MOVIELENS_DIR, 'ratings.dat'), sep='::', header=None, names=rnames) mnames = ['movie_id', 'title', 'genres'] movies = pd.read_table(os.path.join(MOVIELENS_DIR, 'movies.dat'), sep='::', header=None, names=mnames, encoding='iso-8859-1') movies[:100] import traceback try: movies[:100] except: traceback.print_exc() # explicit encoding of movies file import pandas as pd import codecs unames = ['user_id', 'gender', 'age', 'occupation', 'zip'] users = pd.read_table(os.path.join(MOVIELENS_DIR, 'users.dat'), sep='::', header=None, names=unames) rnames = ['user_id', 'movie_id', 'rating', 'timestamp'] ratings = pd.read_table(os.path.join(MOVIELENS_DIR, 'ratings.dat'), sep='::', header=None, names=rnames) movies_file = codecs.open(os.path.join(MOVIELENS_DIR, 'movies.dat'), encoding='iso-8859-1') mnames = ['movie_id', 'title', 'genres'] movies = pd.read_table(movies_file, sep='::', header=None, names=mnames) movies[:100] users[:5] movies[:100] import codecs from itertools import islice fname = os.path.join(MOVIELENS_DIR, "movies.dat") f = codecs.open(fname, encoding='iso-8859-1') for line in islice(f,100): print line import pandas as pd import codecs movies_file = codecs.open(os.path.join(MOVIELENS_DIR, 'movies.dat'), encoding='iso-8859-1') mnames = ['movie_id', 'title', 'genres'] movies = pd.read_table(movies_file, sep='::', header=None, names=mnames) print (movies.ix[72]['title'] == u'Misérables, Les (1995)') import pandas as pd import codecs names1880_file = codecs.open(os.path.join(NAMES_DIR,'yob2010.txt'), encoding='iso-8859-1') names1880 = pd.read_csv(names1880_file, names=['name', 'sex', 'births']) names1880 # sort by name names1880.sort('births', ascending=False)[:10] names1880[names1880.sex == 'F'].sort('births', ascending=False)[:10] names1880['births'].plot() names1880['births'].count()