In [3]:

%pylab inline

Populating the interactive namespace from numpy and matplotlib

1.usa.gov data from bit.ly¶

A feed of anonymous data gathered from users who shorten links ending with .gov and .mil.

In [6]:

path = '/Users/sergulaydore/pydata-book/ch02/usagov_bitly_data2012-03-16-1331923249.txt'

In [7]:

open(path).readline() # each line containsa common form of web data known as JSON (JavaScript Object Notation)

Out[7]:

'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'

importing a JSON string into a Python dictionary¶

In [8]:

import json
record = [json.loads(line) for line in open(path)]

In [9]:

record[0]

Out[9]:

{u'a': u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 u'al': u'en-US,en;q=0.8',
 u'c': u'US',
 u'cy': u'Danvers',
 u'g': u'A6qOVH',
 u'gr': u'MA',
 u'h': u'wfLQtf',
 u'hc': 1331822918,
 u'hh': u'1.usa.gov',
 u'l': u'orofrog',
 u'll': [42.576698, -70.954903],
 u'nk': 1,
 u'r': u'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
 u't': 1331923247,
 u'tz': u'America/New_York',
 u'u': u'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}

The resulting object records is now a list of Python dicts.

In [11]:

record[0]['tz']

Out[11]:

u'America/New_York'

Counting Time Zones in Pure Python¶

In [13]:

time_zones = [rec['tz'] for rec in record if 'tz' in rec]

In [16]:

time_zones[:10]

Out[16]:

[u'America/New_York',
 u'America/Denver',
 u'America/New_York',
 u'America/Sao_Paulo',
 u'America/New_York',
 u'America/New_York',
 u'Europe/Warsaw',
 u'',
 u'',
 u'']

In [17]:

def get_counts(sequence):
    counts = {}
    for x in sequence:
        if x in counts: 
            counts[x] +=1
        else:
            counts[x] = 1
    return counts        

In [19]:

from collections import defaultdict

def get_counts2(sequence):
    counts = defaultdict(int) # values will initialize to 0
    for x in sequence:
        counts[x] +=1
    return counts    

In [20]:

counts = get_counts(time_zones)

In [21]:

counts['America/New_York']

Out[21]:

In [22]:

len(time_zones)

Out[22]:

If we wanted the top ten time zones and their counts

In [27]:

def top_counts(counts_dict,n=10):
    value_key_pairs = [(count, tz) for tz,count in counts_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]

In [28]:

top_counts(counts)

Out[28]:

[(33, u'America/Sao_Paulo'),
 (35, u'Europe/Madrid'),
 (36, u'Pacific/Honolulu'),
 (37, u'Asia/Tokyo'),
 (74, u'Europe/London'),
 (191, u'America/Denver'),
 (382, u'America/Los_Angeles'),
 (400, u'America/Chicago'),
 (521, u''),
 (1251, u'America/New_York')]

collections.Counter class makes this tak a lot easier.

In [29]:

from collections import Counter

In [30]:

counts = Counter(time_zones)

In [32]:

counts.most_common(10)

Out[32]:

[(u'America/New_York', 1251),
 (u'', 521),
 (u'America/Chicago', 400),
 (u'America/Los_Angeles', 382),
 (u'America/Denver', 191),
 (u'Europe/London', 74),
 (u'Asia/Tokyo', 37),
 (u'Pacific/Honolulu', 36),
 (u'Europe/Madrid', 35),
 (u'America/Sao_Paulo', 33)]

Counting Time Zones with pandas¶

In [33]:

from pandas import DataFrame, Series

In [34]:

import pandas as pd; import numpy as np

In [36]:

frame = DataFrame(record)

In [42]:

frame.head()

Out[42]:

	_heartbeat_	a	al	c	cy	g	gr	h	hc	hh	kw	l	ll	nk	r	t	tz	u
0	NaN	Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...	en-US,en;q=0.8	US	Danvers	A6qOVH	MA	wfLQtf	1331822918	1.usa.gov	NaN	orofrog	[42.576698, -70.954903]	1	http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/...	1331923247	America/New_York	http://www.ncbi.nlm.nih.gov/pubmed/22415991
1	NaN	GoogleMaps/RochesterNY	NaN	US	Provo	mwszkS	UT	mwszkS	1308262393	j.mp	NaN	bitly	[40.218102, -111.613297]	0	http://www.AwareMap.com/	1331923249	America/Denver	http://www.monroecounty.gov/etc/911/rss.php
2	NaN	Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...	en-US	US	Washington	xxr3Qb	DC	xxr3Qb	1331919941	1.usa.gov	NaN	bitly	[38.9007, -77.043098]	1	http://t.co/03elZC4Q	1331923250	America/New_York	http://boxer.senate.gov/en/press/releases/0316...
3	NaN	Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...	pt-br	BR	Braz	zCaLwp	27	zUtuOu	1331923068	1.usa.gov	NaN	alelex88	[-23.549999, -46.616699]	0	direct	1331923249	America/Sao_Paulo	http://apod.nasa.gov/apod/ap120312.html
4	NaN	Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...	en-US,en;q=0.8	US	Shrewsbury	9b6kNl	MA	9b6kNl	1273672411	bit.ly	NaN	bitly	[42.286499, -71.714699]	0	http://www.shrewsbury-ma.gov/selco/	1331923251	America/New_York	http://www.shrewsbury-ma.gov/egov/gallery/1341...

In [43]:

frame['tz'][:10]

Out[43]:

0     America/New_York
1       America/Denver
2     America/New_York
3    America/Sao_Paulo
4     America/New_York
5     America/New_York
6        Europe/Warsaw
7                     
8                     
9                     
Name: tz, dtype: object

In [44]:

tz_counts = frame['tz'].value_counts()

In [45]:

tz_counts[:10]

Out[45]:

America/New_York       1251
                        521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
America/Sao_Paulo        33
dtype: int64

In [46]:

clean_tz = frame['tz'].fillna('Missing')

In [47]:

clean_tz[clean_tz=='']='Unknown'

In [48]:

tz_counts = clean_tz.value_counts()

In [49]:

tz_counts[:10]

Out[49]:

America/New_York       1251
Unknown                 521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Missing                 120
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
dtype: int64

In [50]:

tz_counts[:10].plot(kind='barh',rot=0)

Out[50]:

<matplotlib.axes._subplots.AxesSubplot at 0x1083abd90>

The a field contains information about the browser, device, or application used to perform the URL shortening:

In [51]:

frame['a'][1]

Out[51]:

u'GoogleMaps/RochesterNY'

In [52]:

frame['a'][50]

Out[52]:

u'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'

In [53]:

frame['a'][51]

Out[53]:

u'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'

In [54]:

results = Series([x.split()[0] for x in frame.a.dropna()])

In [55]:

results[:5]

Out[55]:

0               Mozilla/5.0
1    GoogleMaps/RochesterNY
2               Mozilla/4.0
3               Mozilla/5.0
4               Mozilla/5.0
dtype: object

In [56]:

results.value_counts()[:8]

Out[56]:

Mozilla/5.0                 2594
Mozilla/4.0                  601
GoogleMaps/RochesterNY       121
Opera/9.80                    34
TEST_INTERNET_AGENT           24
GoogleProducer                21
Mozilla/6.0                    5
BlackBerry8520/5.0.0.681       4
dtype: int64

Now, suppose you wanted to decompose the top time zones into Windows and non-windows users.

In [57]:

cframe = frame[frame.a.notnull()]

In [58]:

operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows','Not Windows')

In [59]:

operating_system[:5]

Out[59]:

array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows'], 
      dtype='|S11')

Then, you can group the data by its time zone column and this new list of operating systems:

In [60]:

by_tz_os = cframe.groupby(['tz', operating_system])

The group counts, analogous to the value counts function above, can be computed using size. This result is then reshaped into a table with unstack:

In [61]:

agg_counts = by_tz_os.size().unstack().fillna(0)

In [62]:

agg_counts[:10]

Out[62]:

	Not Windows	Windows
tz
	245	276
Africa/Cairo	0	3
Africa/Casablanca	0	1
Africa/Ceuta	0	2
Africa/Johannesburg	0	1
Africa/Lusaka	0	1
America/Anchorage	4	1
America/Argentina/Buenos_Aires	1	0
America/Argentina/Cordoba	0	1
America/Argentina/Mendoza	0	1

In [75]:

indexer = agg_counts.sum(1).argsort()

In [76]:

indexer[:10]

Out[76]:

tz
                                  24
Africa/Cairo                      20
Africa/Casablanca                 21
Africa/Ceuta                      92
Africa/Johannesburg               87
Africa/Lusaka                     53
America/Anchorage                 54
America/Argentina/Buenos_Aires    57
America/Argentina/Cordoba         26
America/Argentina/Mendoza         55
dtype: int64

I then use take to select the rows in that order, then slice off the last 10 rows.

In [77]:

count_subset = agg_counts.take(indexer)[-10:]

In [78]:

count_subset

Out[78]:

	Not Windows	Windows
tz
America/Sao_Paulo	13	20
Europe/Madrid	16	19
Pacific/Honolulu	0	36
Asia/Tokyo	2	35
Europe/London	43	31
America/Denver	132	59
America/Los_Angeles	130	252
America/Chicago	115	285
	245	276
America/New_York	339	912

In [79]:

count_subset.plot(kind = 'barh',stacked = True)

Out[79]:

<matplotlib.axes._subplots.AxesSubplot at 0x108d1ab10>

In [80]:

normed_subset = count_subset.div(count_subset.sum(1),axis=0)

In [81]:

normed_subset.plot(kind = 'barh', stacked = True)

Out[81]:

<matplotlib.axes._subplots.AxesSubplot at 0x108ec0c50>

MovieLens 1 M Data Set¶

In [83]:

path = '/Users/sergulaydore/pydata-book/ch02/movielens/'
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']

In [84]:

users = pd.read_table(path + 'users.dat', sep = '::', header = None, names = unames)

/Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:624: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators; you can avoid this warning by specifying engine='python'.
  ParserWarning)

In [85]:

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(path + 'ratings.dat', sep = '::', header = None, names = rnames)

In [86]:

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(path + 'movies.dat', sep='::', header = None, names = mnames)

In [87]:

users[:5]

Out[87]:

	user_id	gender	age	occupation	zip
0	1	F	1	10	48067
1	2	M	56	16	70072
2	3	M	25	15	55117
3	4	M	45	7	02460
4	5	M	25	20	55455

In [88]:

ratings[:5]

Out[88]:

	user_id	movie_id	rating	timestamp
0	1	1193	5	978300760
1	1	661	3	978302109
2	1	914	3	978301968
3	1	3408	4	978300275
4	1	2355	5	978824291

In [89]:

movies[:5]

Out[89]:

	movie_id	title	genres
0	1	Toy Story (1995)	Animation\|Children's\|Comedy
1	2	Jumanji (1995)	Adventure\|Children's\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama
4	5	Father of the Bride Part II (1995)	Comedy

In [90]:

data = pd.merge(pd.merge(ratings,users), movies)

In [92]:

data.head()

Out[92]:

	user_id	movie_id	rating	timestamp	gender	age	occupation	zip	title	genres
0	1	1193	5	978300760	F	1	10	48067	One Flew Over the Cuckoo's Nest (1975)	Drama
1	2	1193	5	978298413	M	56	16	70072	One Flew Over the Cuckoo's Nest (1975)	Drama
2	12	1193	4	978220179	M	25	12	32793	One Flew Over the Cuckoo's Nest (1975)	Drama
3	15	1193	4	978199279	M	25	7	22903	One Flew Over the Cuckoo's Nest (1975)	Drama
4	17	1193	5	978158471	M	50	1	95350	One Flew Over the Cuckoo's Nest (1975)	Drama

In [93]:

data.ix[0]

Out[93]:

user_id                                            1
movie_id                                        1193
rating                                             5
timestamp                                  978300760
gender                                             F
age                                                1
occupation                                        10
zip                                            48067
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 0, dtype: object

To get mean movie ratings for each film grouped by gender, we can use the pivot_table method:

In [94]:

mean_ratings = data.pivot_table('rating',rows='title',cols='gender',aggfunc = 'mean') 

/Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/util/decorators.py:53: FutureWarning: cols is deprecated, use columns instead
  warnings.warn(msg, FutureWarning)
/Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/util/decorators.py:53: FutureWarning: rows is deprecated, use index instead
  warnings.warn(msg, FutureWarning)

In [95]:

mean_ratings[:5]

Out[95]:

gender	F	M
title
$1,000,000 Duck (1971)	3.375000	2.761905
'Night Mother (1986)	3.388889	3.352941
'Til There Was You (1997)	2.675676	2.733333
'burbs, The (1989)	2.793478	2.962085
...And Justice for All (1979)	3.828571	3.689024

In [100]:

ratings_by_title = data.groupby('title').size()

In [101]:

ratings_by_title[:10]

Out[101]:

title
$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64

In [102]:

active_titles = ratings_by_title.index[ratings_by_title >= 250]

In [103]:

active_titles

Out[103]:

Index([u''burbs, The (1989)', u'10 Things I Hate About You (1999)', u'101 Dalmatians (1961)', u'101 Dalmatians (1996)', u'12 Angry Men (1957)', u'13th Warrior, The (1999)', u'2 Days in the Valley (1996)', u'20,000 Leagues Under the Sea (1954)', u'2001: A Space Odyssey (1968)', u'2010 (1984)', u'28 Days (2000)', u'39 Steps, The (1935)', u'54 (1998)', u'7th Voyage of Sinbad, The (1958)', u'8MM (1999)', u'About Last Night... (1986)', u'Absent Minded Professor, The (1961)', u'Absolute Power (1997)', u'Abyss, The (1989)', u'Ace Ventura: Pet Detective (1994)', u'Ace Ventura: When Nature Calls (1995)', u'Addams Family Values (1993)', u'Addams Family, The (1991)', u'Adventures in Babysitting (1987)', u'Adventures of Buckaroo Bonzai Across the 8th Dimension, The (1984)', u'Adventures of Priscilla, Queen of the Desert, The (1994)', u'Adventures of Robin Hood, The (1938)', u'African Queen, The (1951)', u'Age of Innocence, The (1993)', u'Agnes of God (1985)', u'Air America (1990)', u'Air Force One (1997)', u'Airplane II: The Sequel (1982)', u'Airplane! (1980)', u'Akira (1988)', u'Aladdin (1992)', u'Alice in Wonderland (1951)', u'Alien (1979)', u'Alien Nation (1988)', u'Alien: Resurrection (1997)', u'Aliens (1986)', u'Alien� (1992)', u'Alive (1993)', u'All About Eve (1950)', u'All About My Mother (Todo Sobre Mi Madre) (1999)', u'All Quiet on the Western Front (1930)', u'All That Jazz (1979)', u'Almost Famous (2000)', u'Amadeus (1984)', u'American Beauty (1999)', u'American Gigolo (1980)', u'American Graffiti (1973)', u'American History X (1998)', u'American Movie (1999)', u'American Pie (1999)', u'American President, The (1995)', u'American Psycho (2000)', u'American Tail, An (1986)', u'American Werewolf in London, An (1981)', u'American Werewolf in Paris, An (1997)', u'American in Paris, An (1951)', u'Amistad (1997)', u'Amityville Horror, The (1979)', u'Anaconda (1997)', u'Analyze This (1999)', u'Anastasia (1997)', u'And Now for Something Completely Different (1971)', u'Angel Heart (1987)', u'Animal House (1978)', u'Anna and the King (1999)', u'Annie Hall (1977)', u'Antz (1998)', u'Any Given Sunday (1999)', u'Apartment, The (1960)', u'Apocalypse Now (1979)', u'Apollo 13 (1995)', u'Apostle, The (1997)', u'Arachnophobia (1990)', u'Aristocats, The (1970)', u'Arlington Road (1999)', u'Armageddon (1998)', u'Army of Darkness (1993)', u'Around the World in 80 Days (1956)', u'Arrival, The (1996)', u'Arsenic and Old Lace (1944)', u'Arthur (1981)', u'As Good As It Gets (1997)', u'Astronaut's Wife, The (1999)', u'Atlantic City (1980)', u'Auntie Mame (1958)', u'Austin Powers: International Man of Mystery (1997)', u'Austin Powers: The Spy Who Shagged Me (1999)', u'Avengers, The (1998)', u'Awakenings (1990)', u'Babe (1995)', u'Babe: Pig in the City (1998)', u'Bachelor Party (1984)', u'Bachelor, The (1999)', u'Back to School (1986)', u'Back to the Future (1985)', ...], dtype='object')

The index of titles receiving at least 250 ratings can then be used to select rows from mean_ratings above:

In [104]:

mean_ratings = mean_ratings.ix[active_titles]

In [106]:

mean_ratings.head()

Out[106]:

gender	F	M
title
'burbs, The (1989)	2.793478	2.962085
10 Things I Hate About You (1999)	3.646552	3.311966
101 Dalmatians (1961)	3.791444	3.500000
101 Dalmatians (1996)	3.240000	2.911215
12 Angry Men (1957)	4.184397	4.328421

To see the top films among female viewers, we can sort by the F column in descending order:

In [109]:

top_female_ratings = mean_ratings.sort_index(by = 'F', ascending = False)

In [110]:

top_female_ratings[:10]

Out[110]:

gender	F	M
title
Close Shave, A (1995)	4.644444	4.473795
Wrong Trousers, The (1993)	4.588235	4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)	4.572650	4.464589
Wallace & Gromit: The Best of Aardman Animation (1996)	4.563107	4.385075
Schindler's List (1993)	4.562602	4.491415
Shawshank Redemption, The (1994)	4.539075	4.560625
Grand Day Out, A (1992)	4.537879	4.293255
To Kill a Mockingbird (1962)	4.536667	4.372611
Creature Comforts (1990)	4.513889	4.272277
Usual Suspects, The (1995)	4.513317	4.518248

Measuring rating disagreement¶

In [111]:

mean_ratings['diff'] = mean_ratings['M']-mean_ratings['F']

In [112]:

sorted_by_diff = mean_ratings.sort_index(by='diff')

In [113]:

sorted_by_diff[:15]

Out[113]:

gender	F	M	diff
title
Dirty Dancing (1987)	3.790378	2.959596	-0.830782
Jumpin' Jack Flash (1986)	3.254717	2.578358	-0.676359
Grease (1978)	3.975265	3.367041	-0.608224
Little Women (1994)	3.870588	3.321739	-0.548849
Steel Magnolias (1989)	3.901734	3.365957	-0.535777
Anastasia (1997)	3.800000	3.281609	-0.518391
Rocky Horror Picture Show, The (1975)	3.673016	3.160131	-0.512885
Color Purple, The (1985)	4.158192	3.659341	-0.498851
Age of Innocence, The (1993)	3.827068	3.339506	-0.487561
Free Willy (1993)	2.921348	2.438776	-0.482573
French Kiss (1995)	3.535714	3.056962	-0.478752
Little Shop of Horrors, The (1960)	3.650000	3.179688	-0.470312
Guys and Dolls (1955)	4.051724	3.583333	-0.468391
Mary Poppins (1964)	4.197740	3.730594	-0.467147
Patch Adams (1998)	3.473282	3.008746	-0.464536

In [114]:

sorted_by_diff[::-1][:15]

Out[114]:

gender	F	M	diff
title
Good, The Bad and The Ugly, The (1966)	3.494949	4.221300	0.726351
Kentucky Fried Movie, The (1977)	2.878788	3.555147	0.676359
Dumb & Dumber (1994)	2.697987	3.336595	0.638608
Longest Day, The (1962)	3.411765	4.031447	0.619682
Cable Guy, The (1996)	2.250000	2.863787	0.613787
Evil Dead II (Dead By Dawn) (1987)	3.297297	3.909283	0.611985
Hidden, The (1987)	3.137931	3.745098	0.607167
Rocky III (1982)	2.361702	2.943503	0.581801
Caddyshack (1980)	3.396135	3.969737	0.573602
For a Few Dollars More (1965)	3.409091	3.953795	0.544704
Porky's (1981)	2.296875	2.836364	0.539489
Animal House (1978)	3.628906	4.167192	0.538286
Exorcist, The (1973)	3.537634	4.067239	0.529605
Fright Night (1985)	2.973684	3.500000	0.526316
Barb Wire (1996)	1.585366	2.100386	0.515020

In [116]:

rating_std_by_title = data.groupby('title')['rating'].std()

In [117]:

rating_std_by_title = rating_std_by_title.ix[active_titles]

In [118]:

rating_std_by_title.order(ascending=False)[:10]

Out[118]:

title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Eyes Wide Shut (1999)                    1.259624
Evita (1996)                             1.253631
Billy Madison (1995)                     1.249970
Fear and Loathing in Las Vegas (1998)    1.246408
Bicentennial Man (1999)                  1.245533
Name: rating, dtype: float64

US Baby Names 1880 - 2010¶

In [123]:

path = '/Users/sergulaydore/pydata-book/ch02/names/'

In [122]:

!head -n 10 '/Users/sergulaydore/pydata-book/ch02/names/yob1880.txt'

In [124]:

names1880 = pd.read_csv(path + 'yob1880.txt', names = ['name','sex','births'])

In [126]:

names1880[:10]

Out[126]:

	name	sex	births
0	Mary	F	7065
1	Anna	F	2604
2	Emma	F	2003
3	Elizabeth	F	1939
4	Minnie	F	1746
5	Margaret	F	1578
6	Ida	F	1472
7	Alice	F	1414
8	Bertha	F	1320
9	Sarah	F	1288

In [127]:

names1880.groupby('sex').births.sum()

Out[127]:

sex
F       90993
M      110493
Name: births, dtype: int64

In [132]:

years = range(1880, 2011)

In [133]:

pieces = []
columns = ['name','sex','births']

In [134]:

for year in years:
    path_file = path + 'yob%d.txt' %year
    frame = pd.read_csv(path_file, names=columns)
    frame['year']=year
    pieces.append(frame)

In [135]:

# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)

In [142]:

total_births = names.pivot_table('births',rows='year',cols='sex',aggfunc=sum)

In [143]:

total_births.tail()

Out[143]:

sex	F	M
year
2006	1896468	2050234
2007	1916888	2069242
2008	1883645	2032310
2009	1827643	1973359
2010	1759010	1898382

In [144]:

total_births.plot(title='Total births by sex and year')

Out[144]:

<matplotlib.axes._subplots.AxesSubplot at 0x114efac10>

Next, let's insert a column prop with the fraction of babies given each name realtive to the total number of births.

In [145]:

def  add_prop(group):
    # integer division floors
    births = group.births.astype('float')
    
    group['prop'] = births / births.sum()
    return group

In [146]:

names = names.groupby(['year','sex']).apply(add_prop)

In [148]:

names.head()

Out[148]:

	name	sex	births	year	prop
0	Mary	F	7065	1880	0.077643
1	Anna	F	2604	1880	0.028618
2	Emma	F	2003	1880	0.022013
3	Elizabeth	F	1939	1880	0.021309
4	Minnie	F	1746	1880	0.019188

In [149]:

np.allclose(names.groupby(['year','sex']).prop.sum(),1)

Out[149]:

True

the top 1000 names for each sex/year combination

In [150]:

def get_top1000(group):
    return group.sort_index(by='births',ascending=False)[:1000]

In [151]:

grouped = names.groupby(['year','sex'])
top1000 = grouped.apply(get_top1000)

In [153]:

top1000[:15]

Out[153]:

			name	sex	births	year	prop
year	sex
1880	F	0	Mary	F	7065	1880	0.077643
		1	Anna	F	2604	1880	0.028618
		2	Emma	F	2003	1880	0.022013
		3	Elizabeth	F	1939	1880	0.021309
		4	Minnie	F	1746	1880	0.019188
		5	Margaret	F	1578	1880	0.017342
		6	Ida	F	1472	1880	0.016177
		7	Alice	F	1414	1880	0.015540
		8	Bertha	F	1320	1880	0.014507
		9	Sarah	F	1288	1880	0.014155
		10	Annie	F	1258	1880	0.013825
		11	Clara	F	1226	1880	0.013474
		12	Ella	F	1156	1880	0.012704
		13	Florence	F	1063	1880	0.011682
		14	Cora	F	1045	1880	0.011484

Analyzing Naming Trends¶

In [155]:

boys = top1000[top1000.sex=='M']

In [156]:

girls = top1000[top1000.sex=='F']

In [157]:

total_births = top1000.pivot_table('births',rows='year',cols='name',aggfunc=sum)

In [158]:

total_births[:10]

Out[158]:

name	Aaden	Aaliyah	Aarav	Aaron	Aarush	Ab	Abagail	Abb	Abbey	Abbie	...	Zoa	Zoe	Zoey	Zoie	Zola	Zollie	Zona	Zora	Zula	Zuri
year
1880	NaN	NaN	NaN	102	NaN	NaN	NaN	NaN	NaN	71	...	8	23	NaN	NaN	7	NaN	8	28	27	NaN
1881	NaN	NaN	NaN	94	NaN	NaN	NaN	NaN	NaN	81	...	NaN	22	NaN	NaN	10	NaN	9	21	27	NaN
1882	NaN	NaN	NaN	85	NaN	NaN	NaN	NaN	NaN	80	...	8	25	NaN	NaN	9	NaN	17	32	21	NaN
1883	NaN	NaN	NaN	105	NaN	NaN	NaN	NaN	NaN	79	...	NaN	23	NaN	NaN	10	NaN	11	35	25	NaN
1884	NaN	NaN	NaN	97	NaN	NaN	NaN	NaN	NaN	98	...	13	31	NaN	NaN	14	6	8	58	27	NaN
1885	NaN	NaN	NaN	88	NaN	6	NaN	NaN	NaN	88	...	6	27	NaN	NaN	12	6	14	48	38	NaN
1886	NaN	NaN	NaN	86	NaN	NaN	NaN	NaN	NaN	84	...	13	25	NaN	NaN	8	NaN	20	52	43	NaN
1887	NaN	NaN	NaN	78	NaN	NaN	NaN	NaN	NaN	104	...	9	34	NaN	NaN	23	NaN	28	46	33	NaN
1888	NaN	NaN	NaN	90	NaN	NaN	NaN	NaN	NaN	137	...	11	42	NaN	NaN	23	7	30	42	45	NaN
1889	NaN	NaN	NaN	85	NaN	NaN	NaN	NaN	NaN	107	...	14	29	NaN	NaN	22	NaN	29	53	55	NaN

10 rows × 6868 columns

In [159]:

subset = total_births[['John','Harry','Mary','Marilyn']]

In [160]:

subset.plot(subplots=True, figsize=(12,10), grid=False, 
            title='Number of births per year')

Out[160]:

array([<matplotlib.axes._subplots.AxesSubplot object at 0x10f360e10>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x10dcca610>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x10298cb10>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x110200410>], dtype=object)

Measuring the increase in naming diversity¶

In [161]:

table = top1000.pivot_table('prop',rows='year',cols='sex',aggfunc=sum)

In [164]:

table.plot(title='Sum of table1000.prop by year ans sex',
           yticks=np.linspace(0,1.2,13), xticks=range(1880,2020,10))

Out[164]:

<matplotlib.axes._subplots.AxesSubplot at 0x10fc64a50>

In [165]:

df = boys[boys.year==2010]

In [166]:

df[:10]

Out[166]:

			name	sex	births	year	prop
year	sex
2010	M	1676644	Jacob	M	21875	2010	0.011523
		1676645	Ethan	M	17866	2010	0.009411
		1676646	Michael	M	17133	2010	0.009025
		1676647	Jayden	M	17030	2010	0.008971
		1676648	William	M	16870	2010	0.008887
		1676649	Alexander	M	16634	2010	0.008762
		1676650	Noah	M	16281	2010	0.008576
		1676651	Daniel	M	15679	2010	0.008259
		1676652	Aiden	M	15403	2010	0.008114
		1676653	Anthony	M	15364	2010	0.008093

In [167]:

prop_cumsum = df.sort_index(by='prop',ascending=False).prop.cumsum()

In [168]:

prop_cumsum[:10]

Out[168]:

year  sex         
2010  M    1676644    0.011523
           1676645    0.020934
           1676646    0.029959
           1676647    0.038930
           1676648    0.047817
           1676649    0.056579
           1676650    0.065155
           1676651    0.073414
           1676652    0.081528
           1676653    0.089621
Name: prop, dtype: float64

In [171]:

prop_cumsum.values.searchsorted(0.5)

Out[171]:

In [172]:

def get_quantile_count(group, q=0.5):
    group = group.sort_index(by='prop',ascending=False)
    return group.prop.cumsum().values.searchsorted(q)+1

In [173]:

diversity = top1000.groupby(['year','sex']).apply(get_quantile_count)

In [174]:

diversity.head()

Out[174]:

year  sex
1880  F      38
      M      14
1881  F      38
      M      14
1882  F      38
dtype: int64

In [175]:

diversity = diversity.unstack('sex')

In [176]:

diversity.head()

Out[176]:

sex	F	M
year
1880	38	14
1881	38	14
1882	38	15
1883	39	15
1884	39	16

In [177]:

diversity.plot(title='Number of popular names in top 50%')

Out[177]:

<matplotlib.axes._subplots.AxesSubplot at 0x1124d0050>

The "last letter" revolution¶

In [178]:

# extract last letter from name column
get_last_letter = lambda x: x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name='last_letter'

In [179]:

table = names.pivot_table('births', rows=last_letters,
                          cols=['sex','year'], aggfunc=sum)

In [180]:

table[:10]

Out[180]:

sex	F										...	M
year	1880	1881	1882	1883	1884	1885	1886	1887	1888	1889	...	2001	2002	2003	2004	2005	2006	2007	2008	2009	2010
last_letter
a	31446	31581	36536	38330	43680	45408	49100	48942	59442	58631	...	39124	38815	37825	38650	36838	36156	34654	32901	31430	28438
b	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	50950	49284	48065	45914	43144	42600	42123	39945	38862	38859
c	NaN	NaN	5	5	NaN	NaN	NaN	NaN	NaN	NaN	...	27113	27238	27697	26778	26078	26635	26864	25318	24048	23125
d	609	607	734	810	916	862	1007	1027	1298	1374	...	60838	55829	53391	51754	50670	51410	50595	47910	46172	44398
e	33378	34080	40399	41914	48089	49616	53884	54353	66750	66663	...	145395	144651	144769	142098	141123	142999	143698	140966	135496	129012
f	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	1758	1817	1819	1904	1985	1968	2090	2195	2212	2255
g	7	5	12	8	24	11	18	25	44	28	...	2151	2084	2009	1837	1882	1929	2040	2059	2396	2666
h	4863	4784	5567	5701	6602	6624	7146	7141	8630	8826	...	85959	88085	88226	89620	92497	98477	99414	100250	99979	98090
i	61	78	81	76	84	92	85	105	141	134	...	20980	23610	26011	28500	31317	33558	35231	38151	40912	42956
j	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	1069	1088	1203	1094	1291	1241	1254	1381	1416	1459

10 rows × 262 columns

In [181]:

subtable = table.reindex(columns=[1910, 1960, 2010], level='year')

In [182]:

subtable.head()

Out[182]:

sex	F			M
year	1910	1960	2010	1910	1960	2010
last_letter
a	108376	691247	670605	977	5204	28438
b	NaN	694	450	411	3912	38859
c	5	49	946	482	15476	23125
d	6750	3729	2607	22111	262112	44398
e	133569	435013	313833	28655	178823	129012

In [183]:

subtable.sum()

Out[183]:

sex  year
F    1910     396416
     1960    2022062
     2010    1759010
M    1910     194198
     1960    2132588
     2010    1898382
dtype: float64

In [184]:

letter_prop = subtable/subtable.sum().astype(float)

In [188]:

import matplotlib.pyplot as plt

In [190]:

fig, axes = plt.subplots(2,1,figsize=(10,8))
letter_prop['M'].plot(kind='bar',rot=0,ax=axes[0],title='Male')
letter_prop['F'].plot(kind='bar',rot=0,ax=axes[1],title='Female',
                      legend=False)

Out[190]:

<matplotlib.axes._subplots.AxesSubplot at 0x117c9cad0>

In [191]:

letter_prop = table/table.sum().astype(float)

In [192]:

dny_ts = letter_prop.ix[['d','n','y'],'M'].T

In [193]:

dny_ts.head()

Out[193]:

	d	n	y
year
1880	0.083055	0.153213	0.075760
1881	0.083247	0.153214	0.077451
1882	0.085340	0.149560	0.077537
1883	0.084066	0.151646	0.079144
1884	0.086120	0.149915	0.080405

In [194]:

dny_ts.plot()

Out[194]:

<matplotlib.axes._subplots.AxesSubplot at 0x116ca0f90>

Boy names that became girl names (and vice versa)¶

In [195]:

all_names = top1000.name.unique()

In [196]:

mask = np.array(['lesl' in x.lower() for x in all_names])

In [197]:

lesley_like = all_names[mask]

In [198]:

lesley_like

Out[198]:

array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)

In [199]:

filtered = top1000[top1000.name.isin(lesley_like)]

In [200]:

filtered.groupby('name').births.sum()

Out[200]:

name
Leslee      1082
Lesley     35022
Lesli        929
Leslie    370429
Lesly      10067
Name: births, dtype: int64

In [201]:

table = filtered.pivot_table('births',rows='year',
                             cols='sex', aggfunc='sum')

In [202]:

table = table.div(table.sum(1),axis=0)

In [203]:

table.tail()

Out[203]:

sex	F	M
year
2006	1	NaN
2007	1	NaN
2008	1	NaN
2009	1	NaN
2010	1	NaN

In [204]:

table.plot(style={'M':'k-','F':'k--'})

Out[204]:

<matplotlib.axes._subplots.AxesSubplot at 0x11379ed10>

In [205]:

pwd

Out[205]:

u'/Users/sergulaydore/python_for_data_analysis'

In [ ]: