Chapter 2, 3 of PDA
import matplotlib.pyplot as plt
import numpy as np
from pylab import figure, show
from pandas import DataFrame, Series
import pandas as pd
To make it more practical for me to look at your homework, I'm again going to assume a relative placement of files. I placed the files from
https://github.com/pydata/pydata-book
in a local directory, which in my case is "/Users/raymondyee/D/Document/Working_with_Open_Data/pydata-book/"
and then symbolically linked (ln -s
) to the the pydata-book from the root directory of the working-open-data folder. i.e., on OS X
cd /Users/raymondyee/D/Document/Working_with_Open_Data/working-open-data
ln -s /Users/raymondyee/D/Document/Working_with_Open_Data/pydata-book/ pydata-book
That way the files from the pydata-book repository look like they sit in the working-open-data directory -- without having to actually copy the files.
With this arrangment, I should then be able to drop your notebook into my own notebooks directory and run them without having to mess around with paths.
import os
USAGOV_BITLY_PATH = os.path.join(os.pardir, "pydata-book", "ch02", "usagov_bitly_data2012-03-16-1331923249.txt")
MOVIELENS_DIR = os.path.join(os.pardir, "pydata-book", "ch02", "movielens")
NAMES_DIR = os.path.join(os.pardir, "pydata-book", "ch02", "names")
assert os.path.exists(USAGOV_BITLY_PATH)
assert os.path.exists(MOVIELENS_DIR)
assert os.path.exists(NAMES_DIR)
Please make sure the above assertions work
(PfDA
, p. 18)
What's in the data file?
In 2011, URL shortening service bit.ly partnered with the United States government website usa.gov to provide a feed of anonymous data gathered from users who shorten links ending with .gov or .mil.
Hourly archive of data: http://bitly.measuredvoice.com/bitly_archive/?C=M;O=D
open(USAGOV_BITLY_PATH).readline()
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
import json
records = [json.loads(line) for line in open(USAGOV_BITLY_PATH)] # list comprehension
Recall what records
is
len(records)
3560
# list of dict -> DataFrame
frame = DataFrame(records)
frame
<class 'pandas.core.frame.DataFrame'> Int64Index: 3560 entries, 0 to 3559 Data columns: _heartbeat_ 120 non-null values a 3440 non-null values al 3094 non-null values c 2919 non-null values cy 2919 non-null values g 3440 non-null values gr 2919 non-null values h 3440 non-null values hc 3440 non-null values hh 3440 non-null values kw 93 non-null values l 3440 non-null values ll 2919 non-null values nk 3440 non-null values r 3440 non-null values t 3440 non-null values tz 3440 non-null values u 3440 non-null values dtypes: float64(4), object(14)
tz_counts = frame['tz'].value_counts()
tz_counts[:10]
America/New_York 1251 521 America/Chicago 400 America/Los_Angeles 382 America/Denver 191 Europe/London 74 Asia/Tokyo 37 Pacific/Honolulu 36 Europe/Madrid 35 America/Sao_Paulo 33
# fillna
clean_tz = frame['tz'].fillna('Missing')
tz_counts = clean_tz.value_counts()
print tz_counts[:10]
America/New_York 1251 521 America/Chicago 400 America/Los_Angeles 382 America/Denver 191 Missing 120 Europe/London 74 Asia/Tokyo 37 Pacific/Honolulu 36 Europe/Madrid 35
(clean_tz == '').value_counts()
False 3039 True 521
# '' -> 'Unknown'
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:10]
America/New_York 1251 Unknown 521 America/Chicago 400 America/Los_Angeles 382 America/Denver 191 Missing 120 Europe/London 74 Asia/Tokyo 37 Pacific/Honolulu 36 Europe/Madrid 35
frame['a'][1]
frame['a'][50]
frame['a'][51]
u'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
tz_counts[:10].plot(kind='barh', rot=0)
<matplotlib.axes.AxesSubplot at 0x26ee5d0>
results = Series([x.split()[0] for x in frame.a.dropna()])
results[:5]
0 Mozilla/5.0 1 GoogleMaps/RochesterNY 2 Mozilla/4.0 3 Mozilla/5.0 4 Mozilla/5.0
results.value_counts()[:8]
Mozilla/5.0 2594 Mozilla/4.0 601 GoogleMaps/RochesterNY 121 Opera/9.80 34 TEST_INTERNET_AGENT 24 GoogleProducer 21 Mozilla/6.0 5 BlackBerry8520/5.0.0.681 4
frame.a.notnull()
0 True 1 True 2 True 3 True 4 True 5 True 6 True 7 True 8 True 9 True 10 True 11 True 12 True 13 False 14 True ... 3545 True 3546 True 3547 True 3548 True 3549 True 3550 True 3551 True 3552 True 3553 True 3554 True 3555 True 3556 True 3557 True 3558 True 3559 True Length: 3560
frame[frame.a.notnull()]
<class 'pandas.core.frame.DataFrame'> Int64Index: 3440 entries, 0 to 3559 Data columns: _heartbeat_ 0 non-null values a 3440 non-null values al 3094 non-null values c 2919 non-null values cy 2919 non-null values g 3440 non-null values gr 2919 non-null values h 3440 non-null values hc 3440 non-null values hh 3440 non-null values kw 93 non-null values l 3440 non-null values ll 2919 non-null values nk 3440 non-null values r 3440 non-null values t 3440 non-null values tz 3440 non-null values u 3440 non-null values dtypes: float64(4), object(14)
cframe = frame[frame.a.notnull()]
meaning of other attributes?
http://www.usa.gov/About/developer-resources/1usagov.shtml#data
frame.ll.notnull()
0 True 1 True 2 True 3 True 4 True 5 True 6 True 7 False 8 False 9 False 10 True 11 True 12 True 13 False 14 True ... 3545 True 3546 True 3547 True 3548 True 3549 True 3550 True 3551 False 3552 True 3553 True 3554 True 3555 True 3556 True 3557 True 3558 True 3559 True Length: 3560
Hints:
apply
on Series
# Naive scatter plot
scatter(frame[frame.ll.notnull()]['ll'].apply(lambda x: x[1]), frame[frame.ll.notnull()]['ll'].apply(lambda x: x[0]), 3, color="g" )
<matplotlib.collections.PathCollection at 0x59947f0>
from itertools import izip
dots = izip(frame[frame.ll.notnull()]['ll'].apply(lambda x: x[1]).values,
frame[frame.ll.notnull()]['ll'].apply(lambda x: x[0]))
# http://matplotlib.org/basemap/users/mapcoords.html
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import numpy as np
# setup Lambert Conformal basemap.
m = Basemap(projection='merc',llcrnrlat=-80,urcrnrlat=80,\
llcrnrlon=-180,urcrnrlon=180,lat_ts=20,resolution='c')
# draw a boundary around the map, fill the background.
# this background will end up being the ocean color, since
# the continents will be drawn on top.
m.drawmapboundary(fill_color='aqua')
# fill continents, set lake color same as ocean color.
m.fillcontinents(color='coral',lake_color='aqua')
# draw state boundaries too
# http://matplotlib.org/basemap/api/basemap_api.html#mpl_toolkits.basemap.Basemap.drawstates
m.drawstates(linewidth=0.1)
m.drawparallels(np.arange(-90.,91.,30.))
m.drawmeridians(np.arange(-180.,181.,60.))
# plot blue dot on Boulder, colorado and label it as such.
lon, lat = -104.237, 40.125 # Location of Boulder
# convert to map projection coords.
# Note that lon,lat can be scalars, lists or numpy arrays.
xpt,ypt = m(lon,lat)
# convert back to lat/lon
lonpt, latpt = m(xpt,ypt,inverse=True)
m.plot(xpt,ypt,'bo',markersize=5) # plot a blue dot there
from itertools import izip
dots = izip(frame[frame.ll.notnull()]['ll'].apply(lambda x: x[1]).values,
frame[frame.ll.notnull()]['ll'].apply(lambda x: x[0]))
for dot in dots:
(xpt, ypt) = m(dot[0], dot[1])
m.plot(xpt, ypt, 'bo', markersize=2)
# http://matplotlib.org/basemap/users/mapcoords.html
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import numpy as np
# setup Lambert Conformal basemap.
m = Basemap(projection='merc',llcrnrlat=-80,urcrnrlat=80,\
llcrnrlon=-180,urcrnrlon=180,lat_ts=20,resolution='c')
# draw a boundary around the map, fill the background.
# this background will end up being the ocean color, since
# the continents will be drawn on top.
m.drawmapboundary(fill_color='aqua')
# fill continents, set lake color same as ocean color.
m.fillcontinents(color='coral',lake_color='aqua')
# draw state boundaries too
# http://matplotlib.org/basemap/api/basemap_api.html#mpl_toolkits.basemap.Basemap.drawstates
m.drawstates(linewidth=0.1)
m.drawparallels(np.arange(-90.,91.,30.))
m.drawmeridians(np.arange(-180.,181.,60.))
# plot blue dot on Boulder, colorado and label it as such.
lon, lat = -104.237, 40.125 # Location of Boulder
# convert to map projection coords.
# Note that lon,lat can be scalars, lists or numpy arrays.
xpt,ypt = m(lon,lat)
# convert back to lat/lon
lonpt, latpt = m(xpt,ypt,inverse=True)
m.plot(xpt,ypt,'bo',markersize=5) # plot a blue dot there
(xpt, ypt) = m(frame[frame.ll.notnull()]['ll'].apply(lambda x: x[1]),
frame[frame.ll.notnull()]['ll'].apply(lambda x: x[0]))
m.plot(xpt, ypt, 'bo', markersize=2)
# the following doesn't work for some reason....
#m.scatter(frame[frame.ll.notnull()]['ll'].apply(lambda x: x[1]), frame[frame.ll.notnull()]['ll'].apply(lambda x: x[0]), 3, color="g" )
# put some text next to the dot, offset a little bit
# (the offset is in map projection coordinates)
[<matplotlib.lines.Line2D at 0xa5032b0>]
timestamp -- range -- when exactly?
histogram?
frame.t.describe()
count 3.440000e+03 mean 1.331925e+09 std 1.019673e+03 min 1.331923e+09 25% 1.331924e+09 50% 1.331925e+09 75% 1.331926e+09 max 1.331927e+09
frame.t.dropna().apply(datetime.datetime.fromtimestamp)
0 2012-03-16 11:40:47 1 2012-03-16 11:40:49 2 2012-03-16 11:40:50 3 2012-03-16 11:40:49 4 2012-03-16 11:40:51 5 2012-03-16 11:40:52 6 2012-03-16 11:40:55 7 2012-03-16 11:40:55 8 2012-03-16 11:40:54 9 2012-03-16 11:40:55 10 2012-03-16 11:40:58 11 2012-03-16 11:40:59 12 2012-03-16 11:40:59 14 2012-03-16 11:41:02 15 2012-03-16 11:41:03 ... 3545 2012-03-16 12:40:32 3546 2012-03-16 12:40:33 3547 2012-03-16 12:40:36 3548 2012-03-16 12:40:36 3549 2012-03-16 12:40:34 3550 2012-03-16 12:40:37 3551 2012-03-16 12:40:37 3552 2012-03-16 12:40:39 3553 2012-03-16 12:40:40 3554 2012-03-16 12:40:40 3555 2012-03-16 12:40:41 3556 2012-03-16 12:40:44 3557 2012-03-16 12:40:46 3558 2012-03-16 12:40:47 3559 2012-03-16 12:40:49 Name: t, Length: 3440
# FILL IN
earliest_dt = datetime.datetime.fromtimestamp(frame.t.min())
assert earliest_dt == datetime.datetime(2012, 3, 16, 11, 40, 47)
latest_dt = datetime.datetime.fromtimestamp(frame.t.max())
assert latest_dt == datetime.datetime(2012, 3, 16, 12, 40, 49)
Hints:
netlocs
as a Series, indexed by Network location part (http://docs.python.org/2/library/urlparse.html) of frame.u
, and holding the number of times that netloc occurs in frame.u
frame.u
0 http://www.ncbi.nlm.nih.gov/pubmed/22415991 1 http://www.monroecounty.gov/etc/911/rss.php 2 http://boxer.senate.gov/en/press/releases/0316... 3 http://apod.nasa.gov/apod/ap120312.html 4 http://www.shrewsbury-ma.gov/egov/gallery/1341... 5 http://www.shrewsbury-ma.gov/egov/gallery/1341... 6 http://www.nasa.gov/mission_pages/nustar/main/... 7 http://www.nasa.gov/mission_pages/nustar/main/... 8 http://www.nasa.gov/mission_pages/nustar/main/... 9 http://apod.nasa.gov/apod/ap120312.html 10 https://www.nysdot.gov/rexdesign/design/commun... 11 http://oversight.house.gov/wp-content/uploads/... 12 https://www.nysdot.gov/rexdesign/design/commun... 13 NaN 14 http://toxtown.nlm.nih.gov/index.php ... 3545 https://www.nysdot.gov/rexdesign/design/commun... 3546 http://healthypeople.gov/2020/connect/webinars... 3547 http://www.epa.gov/otaq/regs/fuels/additive/e1... 3548 http://www.fda.gov/Safety/Recalls/ucm296326.htm 3549 http://www.nasa.gov/mission_pages/WISE/main/in... 3550 http://www.nlm.nih.gov/medlineplus/news/fullst... 3551 http://www.nasa.gov/mission_pages/nustar/main/... 3552 http://travel.state.gov/passport/passport_5535... 3553 http://www.shrewsbury-ma.gov/egov/gallery/1341... 3554 http://www.shrewsbury-ma.gov/egov/gallery/1341... 3555 http://www.fda.gov/AdvisoryCommittees/Committe... 3556 http://www.okc.gov/PublicNotificationSystem/Fo... 3557 http://www.monroecounty.gov/etc/911/rss.php 3558 http://www.ahrq.gov/qual/qitoolkit/ 3559 http://herndon-va.gov/Content/public_safety/Pu... Name: u, Length: 3560
# FILL IN
#http://docs.python.org/2/library/urlparse.html
from urlparse import urlparse
s = frame.u.dropna().apply(lambda u: urlparse(u)[1])
netlocs = s.groupby(s).size().order()[::-1]
# https://github.com/pydata/pandas/issues/240
assert isinstance(netlocs, Series)
assert set(list(netlocs[:5].iteritems())) == set([(u'www.whitehouse.gov', 169),
(u'www.monroecounty.gov', 121),
(u'www.fda.gov', 112),
(u'www.nasa.gov', 733),
(u'www.nysdot.gov', 836)])
PDA p. 26
http://www.grouplens.org/node/73 --> there's also a 10 million ratings dataset -- would be interesting to try out to test scalability of running IPython notebook on laptop
# let's take a look at the data
# my local dir: /Users/raymondyee/D/Document/Working_with_Open_Data/pydata-book/ch02/movielens
!head $MOVIELENS_DIR/movies.dat
1::Toy Story (1995)::Animation|Children's|Comedy 2::Jumanji (1995)::Adventure|Children's|Fantasy 3::Grumpier Old Men (1995)::Comedy|Romance 4::Waiting to Exhale (1995)::Comedy|Drama 5::Father of the Bride Part II (1995)::Comedy 6::Heat (1995)::Action|Crime|Thriller 7::Sabrina (1995)::Comedy|Romance 8::Tom and Huck (1995)::Adventure|Children's 9::Sudden Death (1995)::Action 10::GoldenEye (1995)::Action|Adventure|Thriller
# how many movies?
!wc $MOVIELENS_DIR/movies.dat
3883 15675 171308 ../pydata-book/ch02/movielens/movies.dat
!head $MOVIELENS_DIR/users.dat
1::F::1::10::48067 2::M::56::16::70072 3::M::25::15::55117 4::M::45::7::02460 5::M::25::20::55455 6::F::50::9::55117 7::M::35::1::06810 8::M::25::12::11413 9::M::25::17::61614 10::F::35::1::95370
!head $MOVIELENS_DIR/ratings.dat
1::1193::5::978300760 1::661::3::978302109 1::914::3::978301968 1::3408::4::978300275 1::2355::5::978824291 1::1197::3::978302268 1::1287::5::978302039 1::2804::5::978300719 1::594::4::978302268 1::919::4::978301368
import pandas as pd
import os
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(os.path.join(MOVIELENS_DIR, 'users.dat'), sep='::', header=None,
names=unames)
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(os.path.join(MOVIELENS_DIR, 'ratings.dat'), sep='::', header=None,
names=rnames)
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(os.path.join(MOVIELENS_DIR, 'movies.dat'), sep='::', header=None,
names=mnames, encoding='iso-8859-1')
movies[:100]
movie_id | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children's|Comedy |
1 | 2 | Jumanji (1995) | Adventure|Children's|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
5 | 6 | Heat (1995) | Action|Crime|Thriller |
6 | 7 | Sabrina (1995) | Comedy|Romance |
7 | 8 | Tom and Huck (1995) | Adventure|Children's |
8 | 9 | Sudden Death (1995) | Action |
9 | 10 | GoldenEye (1995) | Action|Adventure|Thriller |
10 | 11 | American President, The (1995) | Comedy|Drama|Romance |
11 | 12 | Dracula: Dead and Loving It (1995) | Comedy|Horror |
12 | 13 | Balto (1995) | Animation|Children's |
13 | 14 | Nixon (1995) | Drama |
14 | 15 | Cutthroat Island (1995) | Action|Adventure|Romance |
15 | 16 | Casino (1995) | Drama|Thriller |
16 | 17 | Sense and Sensibility (1995) | Drama|Romance |
17 | 18 | Four Rooms (1995) | Thriller |
18 | 19 | Ace Ventura: When Nature Calls (1995) | Comedy |
19 | 20 | Money Train (1995) | Action |
20 | 21 | Get Shorty (1995) | Action|Comedy|Drama |
21 | 22 | Copycat (1995) | Crime|Drama|Thriller |
22 | 23 | Assassins (1995) | Thriller |
23 | 24 | Powder (1995) | Drama|Sci-Fi |
24 | 25 | Leaving Las Vegas (1995) | Drama|Romance |
25 | 26 | Othello (1995) | Drama |
26 | 27 | Now and Then (1995) | Drama |
27 | 28 | Persuasion (1995) | Romance |
28 | 29 | City of Lost Children, The (1995) | Adventure|Sci-Fi |
29 | 30 | Shanghai Triad (Yao a yao yao dao waipo qiao) ... | Drama |
30 | 31 | Dangerous Minds (1995) | Drama |
31 | 32 | Twelve Monkeys (1995) | Drama|Sci-Fi |
32 | 33 | Wings of Courage (1995) | Adventure|Romance |
33 | 34 | Babe (1995) | Children's|Comedy|Drama |
34 | 35 | Carrington (1995) | Drama|Romance |
35 | 36 | Dead Man Walking (1995) | Drama |
36 | 37 | Across the Sea of Time (1995) | Documentary |
37 | 38 | It Takes Two (1995) | Comedy |
38 | 39 | Clueless (1995) | Comedy|Romance |
39 | 40 | Cry, the Beloved Country (1995) | Drama |
40 | 41 | Richard III (1995) | Drama|War |
41 | 42 | Dead Presidents (1995) | Action|Crime|Drama |
42 | 43 | Restoration (1995) | Drama |
43 | 44 | Mortal Kombat (1995) | Action|Adventure |
44 | 45 | To Die For (1995) | Comedy|Drama |
45 | 46 | How to Make an American Quilt (1995) | Drama|Romance |
46 | 47 | Seven (Se7en) (1995) | Crime|Thriller |
47 | 48 | Pocahontas (1995) | Animation|Children's|Musical|Romance |
48 | 49 | When Night Is Falling (1995) | Drama|Romance |
49 | 50 | Usual Suspects, The (1995) | Crime|Thriller |
50 | 51 | Guardian Angel (1994) | Action|Drama|Thriller |
51 | 52 | Mighty Aphrodite (1995) | Comedy |
52 | 53 | Lamerica (1994) | Drama |
53 | 54 | Big Green, The (1995) | Children's|Comedy |
54 | 55 | Georgia (1995) | Drama |
55 | 56 | Kids of the Round Table (1995) | Adventure|Children's|Fantasy |
56 | 57 | Home for the Holidays (1995) | Drama |
57 | 58 | Postino, Il (The Postman) (1994) | Drama|Romance |
58 | 59 | Confessional, The (Le Confessionnal) (1995) | Drama|Mystery |
59 | 60 | Indian in the Cupboard, The (1995) | Adventure|Children's|Fantasy |
60 | 61 | Eye for an Eye (1996) | Drama|Thriller |
61 | 62 | Mr. Holland's Opus (1995) | Drama |
62 | 63 | Don't Be a Menace to South Central While Drink... | Comedy |
63 | 64 | Two if by Sea (1996) | Comedy|Romance |
64 | 65 | Bio-Dome (1996) | Comedy |
65 | 66 | Lawnmower Man 2: Beyond Cyberspace (1996) | Sci-Fi|Thriller |
66 | 67 | Two Bits (1995) | Drama |
67 | 68 | French Twist (Gazon maudit) (1995) | Comedy|Romance |
68 | 69 | Friday (1995) | Comedy |
69 | 70 | From Dusk Till Dawn (1996) | Action|Comedy|Crime|Horror|Thriller |
70 | 71 | Fair Game (1995) | Action |
71 | 72 | Kicking and Screaming (1995) | Comedy|Drama |
72 | 73 | Mis�rables, Les (1995) | Drama|Musical |
73 | 74 | Bed of Roses (1996) | Drama|Romance |
74 | 75 | Big Bully (1996) | Comedy|Drama |
75 | 76 | Screamers (1995) | Sci-Fi|Thriller |
76 | 77 | Nico Icon (1995) | Documentary |
77 | 78 | Crossing Guard, The (1995) | Drama |
78 | 79 | Juror, The (1996) | Drama|Thriller |
79 | 80 | White Balloon, The (Badkonake Sefid ) (1995) | Drama |
80 | 81 | Things to Do in Denver when You're Dead (1995) | Crime|Drama|Romance |
81 | 82 | Antonia's Line (Antonia) (1995) | Drama |
82 | 83 | Once Upon a Time... When We Were Colored (1995) | Drama |
83 | 84 | Last Summer in the Hamptons (1995) | Comedy|Drama |
84 | 85 | Angels and Insects (1995) | Drama|Romance |
85 | 86 | White Squall (1996) | Adventure|Drama |
86 | 87 | Dunston Checks In (1996) | Children's|Comedy |
87 | 88 | Black Sheep (1996) | Comedy |
88 | 89 | Nick of Time (1995) | Action|Thriller |
89 | 90 | Journey of August King, The (1995) | Drama |
90 | 92 | Mary Reilly (1996) | Drama|Thriller |
91 | 93 | Vampire in Brooklyn (1995) | Comedy|Romance |
92 | 94 | Beautiful Girls (1996) | Drama |
93 | 95 | Broken Arrow (1996) | Action|Thriller |
94 | 96 | In the Bleak Midwinter (1995) | Comedy |
95 | 97 | Hate (Haine, La) (1995) | Drama |
96 | 98 | Shopping (1994) | Action|Thriller |
97 | 99 | Heidi Fleiss: Hollywood Madam (1995) | Documentary |
98 | 100 | City Hall (1996) | Drama|Thriller |
99 | 101 | Bottle Rocket (1996) | Comedy |
import traceback
try:
movies[:100]
except:
traceback.print_exc()
# explicit encoding of movies file
import pandas as pd
import codecs
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(os.path.join(MOVIELENS_DIR, 'users.dat'), sep='::', header=None,
names=unames)
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(os.path.join(MOVIELENS_DIR, 'ratings.dat'), sep='::', header=None,
names=rnames)
movies_file = codecs.open(os.path.join(MOVIELENS_DIR, 'movies.dat'), encoding='iso-8859-1')
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(movies_file, sep='::', header=None,
names=mnames)
movies[:100]
movie_id | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children's|Comedy |
1 | 2 | Jumanji (1995) | Adventure|Children's|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
5 | 6 | Heat (1995) | Action|Crime|Thriller |
6 | 7 | Sabrina (1995) | Comedy|Romance |
7 | 8 | Tom and Huck (1995) | Adventure|Children's |
8 | 9 | Sudden Death (1995) | Action |
9 | 10 | GoldenEye (1995) | Action|Adventure|Thriller |
10 | 11 | American President, The (1995) | Comedy|Drama|Romance |
11 | 12 | Dracula: Dead and Loving It (1995) | Comedy|Horror |
12 | 13 | Balto (1995) | Animation|Children's |
13 | 14 | Nixon (1995) | Drama |
14 | 15 | Cutthroat Island (1995) | Action|Adventure|Romance |
15 | 16 | Casino (1995) | Drama|Thriller |
16 | 17 | Sense and Sensibility (1995) | Drama|Romance |
17 | 18 | Four Rooms (1995) | Thriller |
18 | 19 | Ace Ventura: When Nature Calls (1995) | Comedy |
19 | 20 | Money Train (1995) | Action |
20 | 21 | Get Shorty (1995) | Action|Comedy|Drama |
21 | 22 | Copycat (1995) | Crime|Drama|Thriller |
22 | 23 | Assassins (1995) | Thriller |
23 | 24 | Powder (1995) | Drama|Sci-Fi |
24 | 25 | Leaving Las Vegas (1995) | Drama|Romance |
25 | 26 | Othello (1995) | Drama |
26 | 27 | Now and Then (1995) | Drama |
27 | 28 | Persuasion (1995) | Romance |
28 | 29 | City of Lost Children, The (1995) | Adventure|Sci-Fi |
29 | 30 | Shanghai Triad (Yao a yao yao dao waipo qiao) ... | Drama |
30 | 31 | Dangerous Minds (1995) | Drama |
31 | 32 | Twelve Monkeys (1995) | Drama|Sci-Fi |
32 | 33 | Wings of Courage (1995) | Adventure|Romance |
33 | 34 | Babe (1995) | Children's|Comedy|Drama |
34 | 35 | Carrington (1995) | Drama|Romance |
35 | 36 | Dead Man Walking (1995) | Drama |
36 | 37 | Across the Sea of Time (1995) | Documentary |
37 | 38 | It Takes Two (1995) | Comedy |
38 | 39 | Clueless (1995) | Comedy|Romance |
39 | 40 | Cry, the Beloved Country (1995) | Drama |
40 | 41 | Richard III (1995) | Drama|War |
41 | 42 | Dead Presidents (1995) | Action|Crime|Drama |
42 | 43 | Restoration (1995) | Drama |
43 | 44 | Mortal Kombat (1995) | Action|Adventure |
44 | 45 | To Die For (1995) | Comedy|Drama |
45 | 46 | How to Make an American Quilt (1995) | Drama|Romance |
46 | 47 | Seven (Se7en) (1995) | Crime|Thriller |
47 | 48 | Pocahontas (1995) | Animation|Children's|Musical|Romance |
48 | 49 | When Night Is Falling (1995) | Drama|Romance |
49 | 50 | Usual Suspects, The (1995) | Crime|Thriller |
50 | 51 | Guardian Angel (1994) | Action|Drama|Thriller |
51 | 52 | Mighty Aphrodite (1995) | Comedy |
52 | 53 | Lamerica (1994) | Drama |
53 | 54 | Big Green, The (1995) | Children's|Comedy |
54 | 55 | Georgia (1995) | Drama |
55 | 56 | Kids of the Round Table (1995) | Adventure|Children's|Fantasy |
56 | 57 | Home for the Holidays (1995) | Drama |
57 | 58 | Postino, Il (The Postman) (1994) | Drama|Romance |
58 | 59 | Confessional, The (Le Confessionnal) (1995) | Drama|Mystery |
59 | 60 | Indian in the Cupboard, The (1995) | Adventure|Children's|Fantasy |
60 | 61 | Eye for an Eye (1996) | Drama|Thriller |
61 | 62 | Mr. Holland's Opus (1995) | Drama |
62 | 63 | Don't Be a Menace to South Central While Drink... | Comedy |
63 | 64 | Two if by Sea (1996) | Comedy|Romance |
64 | 65 | Bio-Dome (1996) | Comedy |
65 | 66 | Lawnmower Man 2: Beyond Cyberspace (1996) | Sci-Fi|Thriller |
66 | 67 | Two Bits (1995) | Drama |
67 | 68 | French Twist (Gazon maudit) (1995) | Comedy|Romance |
68 | 69 | Friday (1995) | Comedy |
69 | 70 | From Dusk Till Dawn (1996) | Action|Comedy|Crime|Horror|Thriller |
70 | 71 | Fair Game (1995) | Action |
71 | 72 | Kicking and Screaming (1995) | Comedy|Drama |
72 | 73 | Misérables, Les (1995) | Drama|Musical |
73 | 74 | Bed of Roses (1996) | Drama|Romance |
74 | 75 | Big Bully (1996) | Comedy|Drama |
75 | 76 | Screamers (1995) | Sci-Fi|Thriller |
76 | 77 | Nico Icon (1995) | Documentary |
77 | 78 | Crossing Guard, The (1995) | Drama |
78 | 79 | Juror, The (1996) | Drama|Thriller |
79 | 80 | White Balloon, The (Badkonake Sefid ) (1995) | Drama |
80 | 81 | Things to Do in Denver when You're Dead (1995) | Crime|Drama|Romance |
81 | 82 | Antonia's Line (Antonia) (1995) | Drama |
82 | 83 | Once Upon a Time... When We Were Colored (1995) | Drama |
83 | 84 | Last Summer in the Hamptons (1995) | Comedy|Drama |
84 | 85 | Angels and Insects (1995) | Drama|Romance |
85 | 86 | White Squall (1996) | Adventure|Drama |
86 | 87 | Dunston Checks In (1996) | Children's|Comedy |
87 | 88 | Black Sheep (1996) | Comedy |
88 | 89 | Nick of Time (1995) | Action|Thriller |
89 | 90 | Journey of August King, The (1995) | Drama |
90 | 92 | Mary Reilly (1996) | Drama|Thriller |
91 | 93 | Vampire in Brooklyn (1995) | Comedy|Romance |
92 | 94 | Beautiful Girls (1996) | Drama |
93 | 95 | Broken Arrow (1996) | Action|Thriller |
94 | 96 | In the Bleak Midwinter (1995) | Comedy |
95 | 97 | Hate (Haine, La) (1995) | Drama |
96 | 98 | Shopping (1994) | Action|Thriller |
97 | 99 | Heidi Fleiss: Hollywood Madam (1995) | Documentary |
98 | 100 | City Hall (1996) | Drama|Thriller |
99 | 101 | Bottle Rocket (1996) | Comedy |
users[:5]
user_id | gender | age | occupation | zip | |
---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 |
1 | 2 | M | 56 | 16 | 70072 |
2 | 3 | M | 25 | 15 | 55117 |
3 | 4 | M | 45 | 7 | 02460 |
4 | 5 | M | 25 | 20 | 55455 |
movies[:100]
movie_id | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children's|Comedy |
1 | 2 | Jumanji (1995) | Adventure|Children's|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
5 | 6 | Heat (1995) | Action|Crime|Thriller |
6 | 7 | Sabrina (1995) | Comedy|Romance |
7 | 8 | Tom and Huck (1995) | Adventure|Children's |
8 | 9 | Sudden Death (1995) | Action |
9 | 10 | GoldenEye (1995) | Action|Adventure|Thriller |
10 | 11 | American President, The (1995) | Comedy|Drama|Romance |
11 | 12 | Dracula: Dead and Loving It (1995) | Comedy|Horror |
12 | 13 | Balto (1995) | Animation|Children's |
13 | 14 | Nixon (1995) | Drama |
14 | 15 | Cutthroat Island (1995) | Action|Adventure|Romance |
15 | 16 | Casino (1995) | Drama|Thriller |
16 | 17 | Sense and Sensibility (1995) | Drama|Romance |
17 | 18 | Four Rooms (1995) | Thriller |
18 | 19 | Ace Ventura: When Nature Calls (1995) | Comedy |
19 | 20 | Money Train (1995) | Action |
20 | 21 | Get Shorty (1995) | Action|Comedy|Drama |
21 | 22 | Copycat (1995) | Crime|Drama|Thriller |
22 | 23 | Assassins (1995) | Thriller |
23 | 24 | Powder (1995) | Drama|Sci-Fi |
24 | 25 | Leaving Las Vegas (1995) | Drama|Romance |
25 | 26 | Othello (1995) | Drama |
26 | 27 | Now and Then (1995) | Drama |
27 | 28 | Persuasion (1995) | Romance |
28 | 29 | City of Lost Children, The (1995) | Adventure|Sci-Fi |
29 | 30 | Shanghai Triad (Yao a yao yao dao waipo qiao) ... | Drama |
30 | 31 | Dangerous Minds (1995) | Drama |
31 | 32 | Twelve Monkeys (1995) | Drama|Sci-Fi |
32 | 33 | Wings of Courage (1995) | Adventure|Romance |
33 | 34 | Babe (1995) | Children's|Comedy|Drama |
34 | 35 | Carrington (1995) | Drama|Romance |
35 | 36 | Dead Man Walking (1995) | Drama |
36 | 37 | Across the Sea of Time (1995) | Documentary |
37 | 38 | It Takes Two (1995) | Comedy |
38 | 39 | Clueless (1995) | Comedy|Romance |
39 | 40 | Cry, the Beloved Country (1995) | Drama |
40 | 41 | Richard III (1995) | Drama|War |
41 | 42 | Dead Presidents (1995) | Action|Crime|Drama |
42 | 43 | Restoration (1995) | Drama |
43 | 44 | Mortal Kombat (1995) | Action|Adventure |
44 | 45 | To Die For (1995) | Comedy|Drama |
45 | 46 | How to Make an American Quilt (1995) | Drama|Romance |
46 | 47 | Seven (Se7en) (1995) | Crime|Thriller |
47 | 48 | Pocahontas (1995) | Animation|Children's|Musical|Romance |
48 | 49 | When Night Is Falling (1995) | Drama|Romance |
49 | 50 | Usual Suspects, The (1995) | Crime|Thriller |
50 | 51 | Guardian Angel (1994) | Action|Drama|Thriller |
51 | 52 | Mighty Aphrodite (1995) | Comedy |
52 | 53 | Lamerica (1994) | Drama |
53 | 54 | Big Green, The (1995) | Children's|Comedy |
54 | 55 | Georgia (1995) | Drama |
55 | 56 | Kids of the Round Table (1995) | Adventure|Children's|Fantasy |
56 | 57 | Home for the Holidays (1995) | Drama |
57 | 58 | Postino, Il (The Postman) (1994) | Drama|Romance |
58 | 59 | Confessional, The (Le Confessionnal) (1995) | Drama|Mystery |
59 | 60 | Indian in the Cupboard, The (1995) | Adventure|Children's|Fantasy |
60 | 61 | Eye for an Eye (1996) | Drama|Thriller |
61 | 62 | Mr. Holland's Opus (1995) | Drama |
62 | 63 | Don't Be a Menace to South Central While Drink... | Comedy |
63 | 64 | Two if by Sea (1996) | Comedy|Romance |
64 | 65 | Bio-Dome (1996) | Comedy |
65 | 66 | Lawnmower Man 2: Beyond Cyberspace (1996) | Sci-Fi|Thriller |
66 | 67 | Two Bits (1995) | Drama |
67 | 68 | French Twist (Gazon maudit) (1995) | Comedy|Romance |
68 | 69 | Friday (1995) | Comedy |
69 | 70 | From Dusk Till Dawn (1996) | Action|Comedy|Crime|Horror|Thriller |
70 | 71 | Fair Game (1995) | Action |
71 | 72 | Kicking and Screaming (1995) | Comedy|Drama |
72 | 73 | Misérables, Les (1995) | Drama|Musical |
73 | 74 | Bed of Roses (1996) | Drama|Romance |
74 | 75 | Big Bully (1996) | Comedy|Drama |
75 | 76 | Screamers (1995) | Sci-Fi|Thriller |
76 | 77 | Nico Icon (1995) | Documentary |
77 | 78 | Crossing Guard, The (1995) | Drama |
78 | 79 | Juror, The (1996) | Drama|Thriller |
79 | 80 | White Balloon, The (Badkonake Sefid ) (1995) | Drama |
80 | 81 | Things to Do in Denver when You're Dead (1995) | Crime|Drama|Romance |
81 | 82 | Antonia's Line (Antonia) (1995) | Drama |
82 | 83 | Once Upon a Time... When We Were Colored (1995) | Drama |
83 | 84 | Last Summer in the Hamptons (1995) | Comedy|Drama |
84 | 85 | Angels and Insects (1995) | Drama|Romance |
85 | 86 | White Squall (1996) | Adventure|Drama |
86 | 87 | Dunston Checks In (1996) | Children's|Comedy |
87 | 88 | Black Sheep (1996) | Comedy |
88 | 89 | Nick of Time (1995) | Action|Thriller |
89 | 90 | Journey of August King, The (1995) | Drama |
90 | 92 | Mary Reilly (1996) | Drama|Thriller |
91 | 93 | Vampire in Brooklyn (1995) | Comedy|Romance |
92 | 94 | Beautiful Girls (1996) | Drama |
93 | 95 | Broken Arrow (1996) | Action|Thriller |
94 | 96 | In the Bleak Midwinter (1995) | Comedy |
95 | 97 | Hate (Haine, La) (1995) | Drama |
96 | 98 | Shopping (1994) | Action|Thriller |
97 | 99 | Heidi Fleiss: Hollywood Madam (1995) | Documentary |
98 | 100 | City Hall (1996) | Drama|Thriller |
99 | 101 | Bottle Rocket (1996) | Comedy |
hmmm...age 1? Where to learn about occupation types? We have zip data...so it'd be fun to map. Might be useful to look at distribution of age, gender, and zip.
# confirm type of users as DataFram and list number of users
type(users), len(users)
(pandas.core.frame.DataFrame, 6040)
users.index, users.columns
(Int64Index([0, 1, 2, ..., 6037, 6038, 6039], dtype=int64), Index([user_id, gender, age, occupation, zip], dtype=object))
# row 0
users.ix[0]
user_id 1 gender F age 1 occupation 10 zip 48067 Name: 0
# a specific "cell" of users
users.ix[0]['occupation']
10
# how to get stats for a column?
users['gender'].describe()
count 6040 unique 2 top M freq 4331
from collections import Counter
Counter(users['gender'])
Counter({'M': 4331, 'F': 1709})
users['age'].describe()
count 6040.000000 mean 30.639238 std 12.895962 min 1.000000 25% 25.000000 50% 25.000000 75% 35.000000 max 56.000000
users['age'].hist()
<matplotlib.axes.AxesSubplot at 0xae00cf0>
ratings['rating'].hist()
<matplotlib.axes.AxesSubplot at 0xae040d0>
ratings['timestamp'].hist()
<matplotlib.axes.AxesSubplot at 0xae32330>
how to convert to datetime.datetime and plot hist in those units? come back to this task
import datetime
datetime.datetime.fromtimestamp(ratings['timestamp'][0])
datetime.datetime(2000, 12, 31, 14, 12, 40)
ratings['timestamp'].order().apply(datetime.datetime.fromtimestamp) #apply(datetime.datetime.fromtimestamp)
1000138 2000-04-25 16:05:32 999873 2000-04-25 16:05:54 1000153 2000-04-25 16:05:54 1000007 2000-04-25 16:06:17 1000192 2000-04-25 16:06:17 999868 2000-04-25 16:07:36 999877 2000-04-25 16:07:36 999920 2000-04-25 16:07:36 999967 2000-04-25 16:07:36 999980 2000-04-25 16:07:36 1000155 2000-04-25 16:08:01 999884 2000-04-25 16:09:51 999888 2000-04-25 16:09:51 999971 2000-04-25 16:09:51 1000186 2000-04-25 16:10:19 ... 984733 2003-02-27 10:17:38 609109 2003-02-27 14:00:12 59497 2003-02-27 15:31:15 511804 2003-02-27 16:51:39 984025 2003-02-28 05:11:19 983967 2003-02-28 05:12:12 879288 2003-02-28 07:05:11 825497 2003-02-28 09:44:20 825877 2003-02-28 09:44:42 825526 2003-02-28 09:45:20 825793 2003-02-28 09:45:38 825438 2003-02-28 09:47:23 825724 2003-02-28 09:49:08 825731 2003-02-28 09:49:08 825603 2003-02-28 09:49:50 Name: timestamp, Length: 1000209
data = pd.merge(pd.merge(ratings,users), movies)
data
<class 'pandas.core.frame.DataFrame'> Int64Index: 1000209 entries, 0 to 1000208 Data columns: user_id 1000209 non-null values movie_id 1000209 non-null values rating 1000209 non-null values timestamp 1000209 non-null values gender 1000209 non-null values age 1000209 non-null values occupation 1000209 non-null values zip 1000209 non-null values title 1000209 non-null values genres 1000209 non-null values dtypes: int64(6), object(4)
data[:5]
user_id | movie_id | rating | timestamp | gender | age | occupation | zip | title | genres | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 | F | 1 | 10 | 48067 | One Flew Over the Cuckoo's Nest (1975) | Drama |
1 | 2 | 1193 | 5 | 978298413 | M | 56 | 16 | 70072 | One Flew Over the Cuckoo's Nest (1975) | Drama |
2 | 12 | 1193 | 4 | 978220179 | M | 25 | 12 | 32793 | One Flew Over the Cuckoo's Nest (1975) | Drama |
3 | 15 | 1193 | 4 | 978199279 | M | 25 | 7 | 22903 | One Flew Over the Cuckoo's Nest (1975) | Drama |
4 | 17 | 1193 | 5 | 978158471 | M | 50 | 1 | 95350 | One Flew Over the Cuckoo's Nest (1975) | Drama |
http://www.grouplens.org/system/files/ml-1m-README.txt has the key for occupations and age ranges:
from collections import Counter
Counter(data['occupation'])
Counter({4: 131032, 0: 130499, 7: 105425, 1: 85351, 17: 72816, 20: 60397, 12: 57214, 2: 50068, 14: 49109, 16: 46021, 6: 37205, 3: 31623, 10: 23290, 15: 22951, 5: 21850, 11: 20563, 19: 14904, 13: 13754, 18: 12086, 9: 11345, 8: 2706})
data.occupation.groupby(data.occupation).size()
occupation 0 130499 1 85351 2 50068 3 31623 4 131032 5 21850 6 37205 7 105425 8 2706 9 11345 10 23290 11 20563 12 57214 13 13754 14 49109 15 22951 16 46021 17 72816 18 12086 19 14904 20 60397
mean_ratings = data.pivot_table('rating', rows='title', cols='gender', aggfunc='mean')
mean_ratings
<class 'pandas.core.frame.DataFrame'> Index: 3706 entries, $1,000,000 Duck (1971) to eXistenZ (1999) Data columns: F 3481 non-null values M 3671 non-null values dtypes: float64(2)
mean_ratings[:5]
gender | F | M |
---|---|---|
title | ||
$1,000,000 Duck (1971) | 3.375000 | 2.761905 |
'Night Mother (1986) | 3.388889 | 3.352941 |
'Til There Was You (1997) | 2.675676 | 2.733333 |
'burbs, The (1989) | 2.793478 | 2.962085 |
...And Justice for All (1979) | 3.828571 | 3.689024 |
# trying to pull together both male and female -- works
data.pivot_table(values='rating', rows='title')
title $1,000,000 Duck (1971) 3.027027 'Night Mother (1986) 3.371429 'Til There Was You (1997) 2.692308 'burbs, The (1989) 2.910891 ...And Justice for All (1979) 3.713568 1-900 (1994) 2.500000 10 Things I Hate About You (1999) 3.422857 101 Dalmatians (1961) 3.596460 101 Dalmatians (1996) 3.046703 12 Angry Men (1957) 4.295455 13th Warrior, The (1999) 3.158667 187 (1997) 2.745455 2 Days in the Valley (1996) 3.283217 20 Dates (1998) 2.856115 20,000 Leagues Under the Sea (1954) 3.702609 ... You've Got Mail (1998) 3.380668 Young Doctors in Love (1982) 2.607595 Young Frankenstein (1974) 4.250629 Young Guns (1988) 3.418149 Young Guns II (1990) 2.907859 Young Poisoner's Handbook, The (1995) 3.632911 Young Sherlock Holmes (1985) 3.390501 Young and Innocent (1937) 3.300000 Your Friends and Neighbors (1998) 3.376147 Zachariah (1971) 3.500000 Zed & Two Noughts, A (1985) 3.413793 Zero Effect (1998) 3.750831 Zero Kelvin (Kjærlighetens kjøtere) (1995) 3.500000 Zeus and Roxanne (1997) 2.521739 eXistenZ (1999) 3.256098 Name: rating, Length: 3706
data.groupby('title').size()
title $1,000,000 Duck (1971) 37 'Night Mother (1986) 70 'Til There Was You (1997) 52 'burbs, The (1989) 303 ...And Justice for All (1979) 199 1-900 (1994) 2 10 Things I Hate About You (1999) 700 101 Dalmatians (1961) 565 101 Dalmatians (1996) 364 12 Angry Men (1957) 616 13th Warrior, The (1999) 750 187 (1997) 55 2 Days in the Valley (1996) 286 20 Dates (1998) 139 20,000 Leagues Under the Sea (1954) 575 ... You've Got Mail (1998) 838 Young Doctors in Love (1982) 79 Young Frankenstein (1974) 1193 Young Guns (1988) 562 Young Guns II (1990) 369 Young Poisoner's Handbook, The (1995) 79 Young Sherlock Holmes (1985) 379 Young and Innocent (1937) 10 Your Friends and Neighbors (1998) 109 Zachariah (1971) 2 Zed & Two Noughts, A (1985) 29 Zero Effect (1998) 301 Zero Kelvin (Kjærlighetens kjøtere) (1995) 2 Zeus and Roxanne (1997) 23 eXistenZ (1999) 410 Length: 3706
import codecs
from itertools import islice
fname = os.path.join(MOVIELENS_DIR, "movies.dat")
f = codecs.open(fname, encoding='iso-8859-1')
for line in islice(f,100):
print line
1::Toy Story (1995)::Animation|Children's|Comedy 2::Jumanji (1995)::Adventure|Children's|Fantasy 3::Grumpier Old Men (1995)::Comedy|Romance 4::Waiting to Exhale (1995)::Comedy|Drama 5::Father of the Bride Part II (1995)::Comedy 6::Heat (1995)::Action|Crime|Thriller 7::Sabrina (1995)::Comedy|Romance 8::Tom and Huck (1995)::Adventure|Children's 9::Sudden Death (1995)::Action 10::GoldenEye (1995)::Action|Adventure|Thriller 11::American President, The (1995)::Comedy|Drama|Romance 12::Dracula: Dead and Loving It (1995)::Comedy|Horror 13::Balto (1995)::Animation|Children's 14::Nixon (1995)::Drama 15::Cutthroat Island (1995)::Action|Adventure|Romance 16::Casino (1995)::Drama|Thriller 17::Sense and Sensibility (1995)::Drama|Romance 18::Four Rooms (1995)::Thriller 19::Ace Ventura: When Nature Calls (1995)::Comedy 20::Money Train (1995)::Action 21::Get Shorty (1995)::Action|Comedy|Drama 22::Copycat (1995)::Crime|Drama|Thriller 23::Assassins (1995)::Thriller 24::Powder (1995)::Drama|Sci-Fi 25::Leaving Las Vegas (1995)::Drama|Romance 26::Othello (1995)::Drama 27::Now and Then (1995)::Drama 28::Persuasion (1995)::Romance 29::City of Lost Children, The (1995)::Adventure|Sci-Fi 30::Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)::Drama 31::Dangerous Minds (1995)::Drama 32::Twelve Monkeys (1995)::Drama|Sci-Fi 33::Wings of Courage (1995)::Adventure|Romance 34::Babe (1995)::Children's|Comedy|Drama 35::Carrington (1995)::Drama|Romance 36::Dead Man Walking (1995)::Drama 37::Across the Sea of Time (1995)::Documentary 38::It Takes Two (1995)::Comedy 39::Clueless (1995)::Comedy|Romance 40::Cry, the Beloved Country (1995)::Drama 41::Richard III (1995)::Drama|War 42::Dead Presidents (1995)::Action|Crime|Drama 43::Restoration (1995)::Drama 44::Mortal Kombat (1995)::Action|Adventure 45::To Die For (1995)::Comedy|Drama 46::How to Make an American Quilt (1995)::Drama|Romance 47::Seven (Se7en) (1995)::Crime|Thriller 48::Pocahontas (1995)::Animation|Children's|Musical|Romance 49::When Night Is Falling (1995)::Drama|Romance 50::Usual Suspects, The (1995)::Crime|Thriller 51::Guardian Angel (1994)::Action|Drama|Thriller 52::Mighty Aphrodite (1995)::Comedy 53::Lamerica (1994)::Drama 54::Big Green, The (1995)::Children's|Comedy 55::Georgia (1995)::Drama 56::Kids of the Round Table (1995)::Adventure|Children's|Fantasy 57::Home for the Holidays (1995)::Drama 58::Postino, Il (The Postman) (1994)::Drama|Romance 59::Confessional, The (Le Confessionnal) (1995)::Drama|Mystery 60::Indian in the Cupboard, The (1995)::Adventure|Children's|Fantasy 61::Eye for an Eye (1996)::Drama|Thriller 62::Mr. Holland's Opus (1995)::Drama 63::Don't Be a Menace to South Central While Drinking Your Juice in the Hood (1996)::Comedy 64::Two if by Sea (1996)::Comedy|Romance 65::Bio-Dome (1996)::Comedy 66::Lawnmower Man 2: Beyond Cyberspace (1996)::Sci-Fi|Thriller 67::Two Bits (1995)::Drama 68::French Twist (Gazon maudit) (1995)::Comedy|Romance 69::Friday (1995)::Comedy 70::From Dusk Till Dawn (1996)::Action|Comedy|Crime|Horror|Thriller 71::Fair Game (1995)::Action 72::Kicking and Screaming (1995)::Comedy|Drama 73::Misérables, Les (1995)::Drama|Musical 74::Bed of Roses (1996)::Drama|Romance 75::Big Bully (1996)::Comedy|Drama 76::Screamers (1995)::Sci-Fi|Thriller 77::Nico Icon (1995)::Documentary 78::Crossing Guard, The (1995)::Drama 79::Juror, The (1996)::Drama|Thriller 80::White Balloon, The (Badkonake Sefid ) (1995)::Drama 81::Things to Do in Denver when You're Dead (1995)::Crime|Drama|Romance 82::Antonia's Line (Antonia) (1995)::Drama 83::Once Upon a Time... When We Were Colored (1995)::Drama 84::Last Summer in the Hamptons (1995)::Comedy|Drama 85::Angels and Insects (1995)::Drama|Romance 86::White Squall (1996)::Adventure|Drama 87::Dunston Checks In (1996)::Children's|Comedy 88::Black Sheep (1996)::Comedy 89::Nick of Time (1995)::Action|Thriller 90::Journey of August King, The (1995)::Drama 92::Mary Reilly (1996)::Drama|Thriller 93::Vampire in Brooklyn (1995)::Comedy|Romance 94::Beautiful Girls (1996)::Drama 95::Broken Arrow (1996)::Action|Thriller 96::In the Bleak Midwinter (1995)::Comedy 97::Hate (Haine, La) (1995)::Drama 98::Shopping (1994)::Action|Thriller 99::Heidi Fleiss: Hollywood Madam (1995)::Documentary 100::City Hall (1996)::Drama|Thriller 101::Bottle Rocket (1996)::Comedy
import pandas as pd
import codecs
movies_file = codecs.open(os.path.join(MOVIELENS_DIR, 'movies.dat'), encoding='iso-8859-1')
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(movies_file, sep='::', header=None,
names=mnames)
print (movies.ix[72]['title'] == u'Misérables, Les (1995)')
True
ratings_by_title = data.groupby('title').size()
ratings_by_title.order(ascending=False).plot()
<matplotlib.axes.AxesSubplot at 0x91dfb90>
ratings_by_title >= 250
title $1,000,000 Duck (1971) False 'Night Mother (1986) False 'Til There Was You (1997) False 'burbs, The (1989) True ...And Justice for All (1979) False 1-900 (1994) False 10 Things I Hate About You (1999) True 101 Dalmatians (1961) True 101 Dalmatians (1996) True 12 Angry Men (1957) True 13th Warrior, The (1999) True 187 (1997) False 2 Days in the Valley (1996) True 20 Dates (1998) False 20,000 Leagues Under the Sea (1954) True ... You've Got Mail (1998) True Young Doctors in Love (1982) False Young Frankenstein (1974) True Young Guns (1988) True Young Guns II (1990) True Young Poisoner's Handbook, The (1995) False Young Sherlock Holmes (1985) True Young and Innocent (1937) False Your Friends and Neighbors (1998) False Zachariah (1971) False Zed & Two Noughts, A (1985) False Zero Effect (1998) True Zero Kelvin (Kjærlighetens kjøtere) (1995) False Zeus and Roxanne (1997) False eXistenZ (1999) True Length: 3706
active_titles = ratings_by_title.index[ratings_by_title >= 250]
active_titles
Index(['burbs, The (1989), 10 Things I Hate About You (1999), 101 Dalmatians (1961), ..., Young Sherlock Holmes (1985), Zero Effect (1998), eXistenZ (1999)], dtype=object)
len(active_titles)
1216
# now subset
mean_ratings = mean_ratings.ix[active_titles]
mean_ratings
<class 'pandas.core.frame.DataFrame'> Index: 1216 entries, 'burbs, The (1989) to eXistenZ (1999) Data columns: F 1216 non-null values M 1216 non-null values dtypes: float64(2)
# top films among female viewers
mean_ratings.sort_index(by='F', ascending=False)[:10]
gender | F | M |
---|---|---|
title | ||
Close Shave, A (1995) | 4.644444 | 4.473795 |
Wrong Trousers, The (1993) | 4.588235 | 4.478261 |
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) | 4.572650 | 4.464589 |
Wallace & Gromit: The Best of Aardman Animation (1996) | 4.563107 | 4.385075 |
Schindler's List (1993) | 4.562602 | 4.491415 |
Shawshank Redemption, The (1994) | 4.539075 | 4.560625 |
Grand Day Out, A (1992) | 4.537879 | 4.293255 |
To Kill a Mockingbird (1962) | 4.536667 | 4.372611 |
Creature Comforts (1990) | 4.513889 | 4.272277 |
Usual Suspects, The (1995) | 4.513317 | 4.518248 |
import pandas as pd
import codecs
names1880_file = codecs.open(os.path.join(NAMES_DIR,'yob2010.txt'), encoding='iso-8859-1')
names1880 = pd.read_csv(names1880_file, names=['name', 'sex', 'births'])
names1880
<class 'pandas.core.frame.DataFrame'> Int64Index: 33838 entries, 0 to 33837 Data columns: name 33838 non-null values sex 33838 non-null values births 33838 non-null values dtypes: int64(1), object(2)
# sort by name
names1880.sort('births', ascending=False)[:10]
name | sex | births | |
---|---|---|---|
0 | Isabella | F | 22731 |
19698 | Jacob | M | 21875 |
1 | Sophia | F | 20477 |
19699 | Ethan | M | 17866 |
2 | Emma | F | 17179 |
19700 | Michael | M | 17133 |
19701 | Jayden | M | 17030 |
19702 | William | M | 16870 |
3 | Olivia | F | 16860 |
19703 | Alexander | M | 16634 |
names1880[names1880.sex == 'F'].sort('births', ascending=False)[:10]
name | sex | births | |
---|---|---|---|
0 | Isabella | F | 22731 |
1 | Sophia | F | 20477 |
2 | Emma | F | 17179 |
3 | Olivia | F | 16860 |
4 | Ava | F | 15300 |
5 | Emily | F | 14172 |
6 | Abigail | F | 14124 |
7 | Madison | F | 13070 |
8 | Chloe | F | 11656 |
9 | Mia | F | 10541 |
names1880['births'].plot()
<matplotlib.axes.AxesSubplot at 0x64ae310>
names1880['births'].order(ascending=False).plot()
<matplotlib.axes.AxesSubplot at 0x64af430>
names1880['births'].order(ascending=False).cumsum().plot()
<matplotlib.axes.AxesSubplot at 0x643ad10>
names1880['births'].count()
33838
names1880.groupby('sex').births.sum()
sex F 1759010 M 1898382 Name: births
# which names that are in both sexes?
# let me do it with Python sets
names1880[names1880.sex == 'M'], names1880[names1880.sex == 'F']
(<class 'pandas.core.frame.DataFrame'> Int64Index: 14140 entries, 19698 to 33837 Data columns: name 14140 non-null values sex 14140 non-null values births 14140 non-null values dtypes: int64(1), object(2), <class 'pandas.core.frame.DataFrame'> Int64Index: 19698 entries, 0 to 19697 Data columns: name 19698 non-null values sex 19698 non-null values births 19698 non-null values dtypes: int64(1), object(2))
m_names = set(names1880[names1880.sex == 'M']['name'])
f_names = set(names1880[names1880.sex == 'F']['name'])
len(m_names & f_names)
2406
# almost...
try:
k = names1880.groupby('name').ix(names1880.groupby('name').size() == 2)
except Exception as e:
print e
else:
print k
'Series' object is not callable
count_sex = names1880.groupby('name').size()
mf_names = count_sex.index[count_sex == 2]
names1880.ix[mf_names]
<class 'pandas.core.frame.DataFrame'> Index: 2406 entries, Aaliyah to Zyon Data columns: name 0 non-null values sex 0 non-null values births 0 non-null values dtypes: float64(1), object(2)
from itertools import islice
mf_names =(k for (k, g) in names1880.groupby('name') if len(g) == 2)
list(islice(mf_names, 5))
['Aaliyah', 'Aamari', 'Aaren', 'Aarin', 'Aaron']
names1880.groupby('name')
<pandas.core.groupby.DataFrameGroupBy at 0x63ba330>
names1880.groupby('name').size().values
array([1, 1, 1, ..., 1, 1, 1], dtype=int64)
!ls $NAMES_DIR
NationalReadMe.pdf yob1890.txt yob1901.txt yob1912.txt yob1923.txt yob1934.txt yob1945.txt yob1956.txt yob1967.txt yob1978.txt yob1989.txt yob2000.txt yob1880.txt yob1891.txt yob1902.txt yob1913.txt yob1924.txt yob1935.txt yob1946.txt yob1957.txt yob1968.txt yob1979.txt yob1990.txt yob2001.txt yob1881.txt yob1892.txt yob1903.txt yob1914.txt yob1925.txt yob1936.txt yob1947.txt yob1958.txt yob1969.txt yob1980.txt yob1991.txt yob2002.txt yob1882.txt yob1893.txt yob1904.txt yob1915.txt yob1926.txt yob1937.txt yob1948.txt yob1959.txt yob1970.txt yob1981.txt yob1992.txt yob2003.txt yob1883.txt yob1894.txt yob1905.txt yob1916.txt yob1927.txt yob1938.txt yob1949.txt yob1960.txt yob1971.txt yob1982.txt yob1993.txt yob2004.txt yob1884.txt yob1895.txt yob1906.txt yob1917.txt yob1928.txt yob1939.txt yob1950.txt yob1961.txt yob1972.txt yob1983.txt yob1994.txt yob2005.txt yob1885.txt yob1896.txt yob1907.txt yob1918.txt yob1929.txt yob1940.txt yob1951.txt yob1962.txt yob1973.txt yob1984.txt yob1995.txt yob2006.txt yob1886.txt yob1897.txt yob1908.txt yob1919.txt yob1930.txt yob1941.txt yob1952.txt yob1963.txt yob1974.txt yob1985.txt yob1996.txt yob2007.txt yob1887.txt yob1898.txt yob1909.txt yob1920.txt yob1931.txt yob1942.txt yob1953.txt yob1964.txt yob1975.txt yob1986.txt yob1997.txt yob2008.txt yob1888.txt yob1899.txt yob1910.txt yob1921.txt yob1932.txt yob1943.txt yob1954.txt yob1965.txt yob1976.txt yob1987.txt yob1998.txt yob2009.txt yob1889.txt yob1900.txt yob1911.txt yob1922.txt yob1933.txt yob1944.txt yob1955.txt yob1966.txt yob1977.txt yob1988.txt yob1999.txt yob2010.txt
# 2010 is the last available year right now
import os
years = range(1880, 2011)
pieces = []
columns = ['name', 'sex', 'births']
for year in years:
path = os.path.join(NAMES_DIR, 'yob%d.txt' % year)
frame = pd.read_csv(path, names=columns)
frame['year'] = year
pieces.append(frame)
# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)
names
<class 'pandas.core.frame.DataFrame'> Int64Index: 1690784 entries, 0 to 1690783 Data columns: name 1690784 non-null values sex 1690784 non-null values births 1690784 non-null values year 1690784 non-null values dtypes: int64(2), object(2)
total_births = names.pivot_table('births', rows='year', cols='sex', aggfunc=sum)
total_births[:5]
sex | F | M |
---|---|---|
year | ||
1880 | 90993 | 110493 |
1881 | 91955 | 100748 |
1882 | 107851 | 113687 |
1883 | 112322 | 104632 |
1884 | 129021 | 114445 |
# how to calculate the total births / year
names.groupby('year').sum().plot(title="total births by year")
<matplotlib.axes.AxesSubplot at 0x63c3c50>
# add prop
def add_prop(group):
# Integer division floors
births = group.births.astype(float)
group['prop'] = births / births.sum()
return group
names = names.groupby(['year', 'sex']).apply(add_prop)
# verify prop
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)
True
# proportion for Raymond
names.name
0 Mary 1 Anna 2 Emma 3 Elizabeth 4 Minnie 5 Margaret 6 Ida 7 Alice 8 Bertha 9 Sarah 10 Annie 11 Clara 12 Ella 13 Florence 14 Cora ... 1690769 Zishan 1690770 Ziven 1690771 Zmari 1690772 Zoren 1690773 Zuhaib 1690774 Zyeire 1690775 Zygmunt 1690776 Zykerion 1690777 Zylar 1690778 Zylin 1690779 Zymaire 1690780 Zyonne 1690781 Zyquarius 1690782 Zyran 1690783 Zzyzx Name: name, Length: 1690784
# max / min totals and when? -- awkward -- must be a better way
total_births_sum = names.groupby('year').sum()
max_value = list(total_births_sum.max())[0]
min_value = list(total_births_sum.min())[0]
is_max = total_births_sum.births == max_value
is_min = total_births_sum.births == min_value
is_max[is_max], is_min[is_min]
(year 1957 True Name: births, year 1881 True Name: births)
total_births_sum.diff().plot()
<matplotlib.axes.AxesSubplot at 0x18a59890>
total_births.sum().sum()
322402727.0
names[(names.name=='Raymond') & (names.sex=='M')][['year', 'births']].set_index(keys='year').plot(title="Raymond")
<matplotlib.axes.AxesSubplot at 0x18a767d0>
# plot multiple names on same plot or as multiple axes
def name_sex_count_in_year(name,sex):
return names[(names.name==name) & (names.sex==sex)][['year', 'births']].set_index(keys='year')
def name_sex_prop_in_year(name,sex):
return names[(names.name==name) & (names.sex==sex)][['year', 'prop']].set_index(keys='year')
name_df = DataFrame(index=arange(1880,2010))
name_df['Raymond'] = name_sex_count_in_year('Raymond','M')
name_df['Laura'] = name_sex_count_in_year('Laura','F')
name_df.plot()
<matplotlib.axes.AxesSubplot at 0xbdd2810>
name_df = DataFrame(index=arange(1880,2010))
name_df['Raymond'] = name_sex_prop_in_year('Raymond','M')
name_df['Laura'] = name_sex_prop_in_year('Laura','F')
name_df.plot()
<matplotlib.axes.AxesSubplot at 0xe103570>
total_births[:5]
sex | F | M |
---|---|---|
year | ||
1880 | 90993 | 110493 |
1881 | 91955 | 100748 |
1882 | 107851 | 113687 |
1883 | 112322 | 104632 |
1884 | 129021 | 114445 |
total_births.plot(title='Total births by sex and year')
<matplotlib.axes.AxesSubplot at 0x17e0abd0>
# http://en.wikipedia.org/wiki/Human_sex_ratio
# make an agg figure
fig = figure()
# meaning of 111: http://stackoverflow.com/a/3584933/7782
ax = fig.add_subplot(111)
ax.set_title('Ratio of M to F births')
cum_ratio_by_sex = total_births.M.cumsum() / total_births.F.cumsum()
cum_ratio_by_sex.plot(ax=ax, label="cumulative", color="red")
# add instantaneous ratio
annual_ratio_by_sex = total_births.M / total_births.F
annual_ratio_by_sex.plot(ax=ax, label="annual", color="green")
ax.legend(loc='best')
fig.canvas.draw()
# number of names over time
names.groupby('year').count()[['name']].plot()
<matplotlib.axes.AxesSubplot at 0xef5a070>
def get_top1000(group):
return group.sort_index(by='births', ascending=False)[:1000]
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
fig = figure()
# meaning of 111: http://stackoverflow.com/a/3584933/7782
ax = fig.add_subplot(111)
ax.set_title('Entropy of names')
S_male = names[names.sex=='M'].groupby('year').prop.agg(lambda x: sum([-j*log(j) for j in x])) # apply(lambda x: -x*log(x))
S_male.plot(ax=ax, label="M", color="blue")
S_female = names[names.sex=='F'].groupby('year').prop.agg(lambda x: sum([-j*log(j) for j in x])) # apply(lambda x: -x*log(x))
S_female.plot(ax=ax, label="F", color="red")
ax.legend(loc='best')
ax.set_ylim(0)
fig.canvas.draw()
names[names.sex=='M'].groupby('name').sum().births.order()[::-1]
name James 5049727 John 5040319 Robert 4767995 Michael 4243903 William 3986586 David 3525961 Richard 2542735 Joseph 2519280 Charles 2335387 Thomas 2254734 Christopher 1947979 Daniel 1810130 Matthew 1498203 George 1443869 Donald 1405550 ... Aaryav 5 Aarren 5 Aaronlee 5 Aaronjosh 5 Aaronjacob 5 Aaraon 5 Aanand 5 Aamil 5 Aamar 5 Aalon 5 Aalok 5 Aalias 5 Aahron 5 Aadhavan 5 Aabid 5 Name: births, Length: 36742
male_names = names[names.sex=='M'].groupby('name').sum().index
female_names = names[names.sex=='F'].groupby('name').sum().index
len(male_names + female_names)
88496
len(np.unique(names.name.values))
88496
# average length of names in general
names.name.str.len().mean()
6.1730889338910231
# average length of name by year
names['name_len'] = names.name.str.len()
names.groupby('year').name_len.mean().plot()
del names['name_len']
fig = figure()
# meaning of 111: http://stackoverflow.com/a/3584933/7782
ax = fig.add_subplot(111)
ax.set_title('average length of name')
names['name_len'] = names.name.str.len()
names[names.sex=='M'].groupby('year').name_len.mean().plot(ax=ax, label="M", color="blue")
names[names.sex=='F'].groupby('year').name_len.mean().plot(ax=ax, label="F", color="red")
#del names['name_len']
ax.legend(loc='best')
fig.canvas.draw()
# calculate number of M, F, M/F names in aggregate and then over time
names[names.year==2010].groupby(['name', 'year']).sex.count().value_counts()
1 29026 2 2406