%pylab inline
Populating the interactive namespace from numpy and matplotlib
A feed of anonymous data gathered from users who shorten links ending with .gov and .mil.
path = '/Users/sergulaydore/pydata-book/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
open(path).readline() # each line containsa common form of web data known as JSON (JavaScript Object Notation)
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
import json
record = [json.loads(line) for line in open(path)]
record[0]
{u'a': u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11', u'al': u'en-US,en;q=0.8', u'c': u'US', u'cy': u'Danvers', u'g': u'A6qOVH', u'gr': u'MA', u'h': u'wfLQtf', u'hc': 1331822918, u'hh': u'1.usa.gov', u'l': u'orofrog', u'll': [42.576698, -70.954903], u'nk': 1, u'r': u'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf', u't': 1331923247, u'tz': u'America/New_York', u'u': u'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}
The resulting object records is now a list of Python dicts.
record[0]['tz']
u'America/New_York'
time_zones = [rec['tz'] for rec in record if 'tz' in rec]
time_zones[:10]
[u'America/New_York', u'America/Denver', u'America/New_York', u'America/Sao_Paulo', u'America/New_York', u'America/New_York', u'Europe/Warsaw', u'', u'', u'']
def get_counts(sequence):
counts = {}
for x in sequence:
if x in counts:
counts[x] +=1
else:
counts[x] = 1
return counts
from collections import defaultdict
def get_counts2(sequence):
counts = defaultdict(int) # values will initialize to 0
for x in sequence:
counts[x] +=1
return counts
counts = get_counts(time_zones)
counts['America/New_York']
1251
len(time_zones)
3440
If we wanted the top ten time zones and their counts
def top_counts(counts_dict,n=10):
value_key_pairs = [(count, tz) for tz,count in counts_dict.items()]
value_key_pairs.sort()
return value_key_pairs[-n:]
top_counts(counts)
[(33, u'America/Sao_Paulo'), (35, u'Europe/Madrid'), (36, u'Pacific/Honolulu'), (37, u'Asia/Tokyo'), (74, u'Europe/London'), (191, u'America/Denver'), (382, u'America/Los_Angeles'), (400, u'America/Chicago'), (521, u''), (1251, u'America/New_York')]
collections.Counter class makes this tak a lot easier.
from collections import Counter
counts = Counter(time_zones)
counts.most_common(10)
[(u'America/New_York', 1251), (u'', 521), (u'America/Chicago', 400), (u'America/Los_Angeles', 382), (u'America/Denver', 191), (u'Europe/London', 74), (u'Asia/Tokyo', 37), (u'Pacific/Honolulu', 36), (u'Europe/Madrid', 35), (u'America/Sao_Paulo', 33)]
from pandas import DataFrame, Series
import pandas as pd; import numpy as np
frame = DataFrame(record)
frame.head()
_heartbeat_ | a | al | c | cy | g | gr | h | hc | hh | kw | l | ll | nk | r | t | tz | u | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | en-US,en;q=0.8 | US | Danvers | A6qOVH | MA | wfLQtf | 1331822918 | 1.usa.gov | NaN | orofrog | [42.576698, -70.954903] | 1 | http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... | 1331923247 | America/New_York | http://www.ncbi.nlm.nih.gov/pubmed/22415991 |
1 | NaN | GoogleMaps/RochesterNY | NaN | US | Provo | mwszkS | UT | mwszkS | 1308262393 | j.mp | NaN | bitly | [40.218102, -111.613297] | 0 | http://www.AwareMap.com/ | 1331923249 | America/Denver | http://www.monroecounty.gov/etc/911/rss.php |
2 | NaN | Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... | en-US | US | Washington | xxr3Qb | DC | xxr3Qb | 1331919941 | 1.usa.gov | NaN | bitly | [38.9007, -77.043098] | 1 | http://t.co/03elZC4Q | 1331923250 | America/New_York | http://boxer.senate.gov/en/press/releases/0316... |
3 | NaN | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... | pt-br | BR | Braz | zCaLwp | 27 | zUtuOu | 1331923068 | 1.usa.gov | NaN | alelex88 | [-23.549999, -46.616699] | 0 | direct | 1331923249 | America/Sao_Paulo | http://apod.nasa.gov/apod/ap120312.html |
4 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | en-US,en;q=0.8 | US | Shrewsbury | 9b6kNl | MA | 9b6kNl | 1273672411 | bit.ly | NaN | bitly | [42.286499, -71.714699] | 0 | http://www.shrewsbury-ma.gov/selco/ | 1331923251 | America/New_York | http://www.shrewsbury-ma.gov/egov/gallery/1341... |
frame['tz'][:10]
0 America/New_York 1 America/Denver 2 America/New_York 3 America/Sao_Paulo 4 America/New_York 5 America/New_York 6 Europe/Warsaw 7 8 9 Name: tz, dtype: object
tz_counts = frame['tz'].value_counts()
tz_counts[:10]
America/New_York 1251 521 America/Chicago 400 America/Los_Angeles 382 America/Denver 191 Europe/London 74 Asia/Tokyo 37 Pacific/Honolulu 36 Europe/Madrid 35 America/Sao_Paulo 33 dtype: int64
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz=='']='Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:10]
America/New_York 1251 Unknown 521 America/Chicago 400 America/Los_Angeles 382 America/Denver 191 Missing 120 Europe/London 74 Asia/Tokyo 37 Pacific/Honolulu 36 Europe/Madrid 35 dtype: int64
tz_counts[:10].plot(kind='barh',rot=0)
<matplotlib.axes._subplots.AxesSubplot at 0x1083abd90>
The a field contains information about the browser, device, or application used to perform the URL shortening:
frame['a'][1]
u'GoogleMaps/RochesterNY'
frame['a'][50]
u'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
frame['a'][51]
u'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
results = Series([x.split()[0] for x in frame.a.dropna()])
results[:5]
0 Mozilla/5.0 1 GoogleMaps/RochesterNY 2 Mozilla/4.0 3 Mozilla/5.0 4 Mozilla/5.0 dtype: object
results.value_counts()[:8]
Mozilla/5.0 2594 Mozilla/4.0 601 GoogleMaps/RochesterNY 121 Opera/9.80 34 TEST_INTERNET_AGENT 24 GoogleProducer 21 Mozilla/6.0 5 BlackBerry8520/5.0.0.681 4 dtype: int64
Now, suppose you wanted to decompose the top time zones into Windows and non-windows users.
cframe = frame[frame.a.notnull()]
operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows','Not Windows')
operating_system[:5]
array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows'], dtype='|S11')
Then, you can group the data by its time zone column and this new list of operating systems:
by_tz_os = cframe.groupby(['tz', operating_system])
The group counts, analogous to the value counts function above, can be computed using size. This result is then reshaped into a table with unstack:
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]
Not Windows | Windows | |
---|---|---|
tz | ||
245 | 276 | |
Africa/Cairo | 0 | 3 |
Africa/Casablanca | 0 | 1 |
Africa/Ceuta | 0 | 2 |
Africa/Johannesburg | 0 | 1 |
Africa/Lusaka | 0 | 1 |
America/Anchorage | 4 | 1 |
America/Argentina/Buenos_Aires | 1 | 0 |
America/Argentina/Cordoba | 0 | 1 |
America/Argentina/Mendoza | 0 | 1 |
indexer = agg_counts.sum(1).argsort()
indexer[:10]
tz 24 Africa/Cairo 20 Africa/Casablanca 21 Africa/Ceuta 92 Africa/Johannesburg 87 Africa/Lusaka 53 America/Anchorage 54 America/Argentina/Buenos_Aires 57 America/Argentina/Cordoba 26 America/Argentina/Mendoza 55 dtype: int64
I then use take to select the rows in that order, then slice off the last 10 rows.
count_subset = agg_counts.take(indexer)[-10:]
count_subset
Not Windows | Windows | |
---|---|---|
tz | ||
America/Sao_Paulo | 13 | 20 |
Europe/Madrid | 16 | 19 |
Pacific/Honolulu | 0 | 36 |
Asia/Tokyo | 2 | 35 |
Europe/London | 43 | 31 |
America/Denver | 132 | 59 |
America/Los_Angeles | 130 | 252 |
America/Chicago | 115 | 285 |
245 | 276 | |
America/New_York | 339 | 912 |
count_subset.plot(kind = 'barh',stacked = True)
<matplotlib.axes._subplots.AxesSubplot at 0x108d1ab10>
normed_subset = count_subset.div(count_subset.sum(1),axis=0)
normed_subset.plot(kind = 'barh', stacked = True)
<matplotlib.axes._subplots.AxesSubplot at 0x108ec0c50>
path = '/Users/sergulaydore/pydata-book/ch02/movielens/'
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(path + 'users.dat', sep = '::', header = None, names = unames)
/Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:624: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators; you can avoid this warning by specifying engine='python'. ParserWarning)
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(path + 'ratings.dat', sep = '::', header = None, names = rnames)
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(path + 'movies.dat', sep='::', header = None, names = mnames)
users[:5]
user_id | gender | age | occupation | zip | |
---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 |
1 | 2 | M | 56 | 16 | 70072 |
2 | 3 | M | 25 | 15 | 55117 |
3 | 4 | M | 45 | 7 | 02460 |
4 | 5 | M | 25 | 20 | 55455 |
ratings[:5]
user_id | movie_id | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 |
1 | 1 | 661 | 3 | 978302109 |
2 | 1 | 914 | 3 | 978301968 |
3 | 1 | 3408 | 4 | 978300275 |
4 | 1 | 2355 | 5 | 978824291 |
movies[:5]
movie_id | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children's|Comedy |
1 | 2 | Jumanji (1995) | Adventure|Children's|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
data = pd.merge(pd.merge(ratings,users), movies)
data.head()
user_id | movie_id | rating | timestamp | gender | age | occupation | zip | title | genres | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 | F | 1 | 10 | 48067 | One Flew Over the Cuckoo's Nest (1975) | Drama |
1 | 2 | 1193 | 5 | 978298413 | M | 56 | 16 | 70072 | One Flew Over the Cuckoo's Nest (1975) | Drama |
2 | 12 | 1193 | 4 | 978220179 | M | 25 | 12 | 32793 | One Flew Over the Cuckoo's Nest (1975) | Drama |
3 | 15 | 1193 | 4 | 978199279 | M | 25 | 7 | 22903 | One Flew Over the Cuckoo's Nest (1975) | Drama |
4 | 17 | 1193 | 5 | 978158471 | M | 50 | 1 | 95350 | One Flew Over the Cuckoo's Nest (1975) | Drama |
data.ix[0]
user_id 1 movie_id 1193 rating 5 timestamp 978300760 gender F age 1 occupation 10 zip 48067 title One Flew Over the Cuckoo's Nest (1975) genres Drama Name: 0, dtype: object
To get mean movie ratings for each film grouped by gender, we can use the pivot_table method:
mean_ratings = data.pivot_table('rating',rows='title',cols='gender',aggfunc = 'mean')
/Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/util/decorators.py:53: FutureWarning: cols is deprecated, use columns instead warnings.warn(msg, FutureWarning) /Users/sergulaydore/anaconda/lib/python2.7/site-packages/pandas/util/decorators.py:53: FutureWarning: rows is deprecated, use index instead warnings.warn(msg, FutureWarning)
mean_ratings[:5]
gender | F | M |
---|---|---|
title | ||
$1,000,000 Duck (1971) | 3.375000 | 2.761905 |
'Night Mother (1986) | 3.388889 | 3.352941 |
'Til There Was You (1997) | 2.675676 | 2.733333 |
'burbs, The (1989) | 2.793478 | 2.962085 |
...And Justice for All (1979) | 3.828571 | 3.689024 |
ratings_by_title = data.groupby('title').size()
ratings_by_title[:10]
title $1,000,000 Duck (1971) 37 'Night Mother (1986) 70 'Til There Was You (1997) 52 'burbs, The (1989) 303 ...And Justice for All (1979) 199 1-900 (1994) 2 10 Things I Hate About You (1999) 700 101 Dalmatians (1961) 565 101 Dalmatians (1996) 364 12 Angry Men (1957) 616 dtype: int64
active_titles = ratings_by_title.index[ratings_by_title >= 250]
active_titles
Index([u''burbs, The (1989)', u'10 Things I Hate About You (1999)', u'101 Dalmatians (1961)', u'101 Dalmatians (1996)', u'12 Angry Men (1957)', u'13th Warrior, The (1999)', u'2 Days in the Valley (1996)', u'20,000 Leagues Under the Sea (1954)', u'2001: A Space Odyssey (1968)', u'2010 (1984)', u'28 Days (2000)', u'39 Steps, The (1935)', u'54 (1998)', u'7th Voyage of Sinbad, The (1958)', u'8MM (1999)', u'About Last Night... (1986)', u'Absent Minded Professor, The (1961)', u'Absolute Power (1997)', u'Abyss, The (1989)', u'Ace Ventura: Pet Detective (1994)', u'Ace Ventura: When Nature Calls (1995)', u'Addams Family Values (1993)', u'Addams Family, The (1991)', u'Adventures in Babysitting (1987)', u'Adventures of Buckaroo Bonzai Across the 8th Dimension, The (1984)', u'Adventures of Priscilla, Queen of the Desert, The (1994)', u'Adventures of Robin Hood, The (1938)', u'African Queen, The (1951)', u'Age of Innocence, The (1993)', u'Agnes of God (1985)', u'Air America (1990)', u'Air Force One (1997)', u'Airplane II: The Sequel (1982)', u'Airplane! (1980)', u'Akira (1988)', u'Aladdin (1992)', u'Alice in Wonderland (1951)', u'Alien (1979)', u'Alien Nation (1988)', u'Alien: Resurrection (1997)', u'Aliens (1986)', u'Alien� (1992)', u'Alive (1993)', u'All About Eve (1950)', u'All About My Mother (Todo Sobre Mi Madre) (1999)', u'All Quiet on the Western Front (1930)', u'All That Jazz (1979)', u'Almost Famous (2000)', u'Amadeus (1984)', u'American Beauty (1999)', u'American Gigolo (1980)', u'American Graffiti (1973)', u'American History X (1998)', u'American Movie (1999)', u'American Pie (1999)', u'American President, The (1995)', u'American Psycho (2000)', u'American Tail, An (1986)', u'American Werewolf in London, An (1981)', u'American Werewolf in Paris, An (1997)', u'American in Paris, An (1951)', u'Amistad (1997)', u'Amityville Horror, The (1979)', u'Anaconda (1997)', u'Analyze This (1999)', u'Anastasia (1997)', u'And Now for Something Completely Different (1971)', u'Angel Heart (1987)', u'Animal House (1978)', u'Anna and the King (1999)', u'Annie Hall (1977)', u'Antz (1998)', u'Any Given Sunday (1999)', u'Apartment, The (1960)', u'Apocalypse Now (1979)', u'Apollo 13 (1995)', u'Apostle, The (1997)', u'Arachnophobia (1990)', u'Aristocats, The (1970)', u'Arlington Road (1999)', u'Armageddon (1998)', u'Army of Darkness (1993)', u'Around the World in 80 Days (1956)', u'Arrival, The (1996)', u'Arsenic and Old Lace (1944)', u'Arthur (1981)', u'As Good As It Gets (1997)', u'Astronaut's Wife, The (1999)', u'Atlantic City (1980)', u'Auntie Mame (1958)', u'Austin Powers: International Man of Mystery (1997)', u'Austin Powers: The Spy Who Shagged Me (1999)', u'Avengers, The (1998)', u'Awakenings (1990)', u'Babe (1995)', u'Babe: Pig in the City (1998)', u'Bachelor Party (1984)', u'Bachelor, The (1999)', u'Back to School (1986)', u'Back to the Future (1985)', ...], dtype='object')
The index of titles receiving at least 250 ratings can then be used to select rows from mean_ratings above:
mean_ratings = mean_ratings.ix[active_titles]
mean_ratings.head()
gender | F | M |
---|---|---|
title | ||
'burbs, The (1989) | 2.793478 | 2.962085 |
10 Things I Hate About You (1999) | 3.646552 | 3.311966 |
101 Dalmatians (1961) | 3.791444 | 3.500000 |
101 Dalmatians (1996) | 3.240000 | 2.911215 |
12 Angry Men (1957) | 4.184397 | 4.328421 |
To see the top films among female viewers, we can sort by the F column in descending order:
top_female_ratings = mean_ratings.sort_index(by = 'F', ascending = False)
top_female_ratings[:10]
gender | F | M |
---|---|---|
title | ||
Close Shave, A (1995) | 4.644444 | 4.473795 |
Wrong Trousers, The (1993) | 4.588235 | 4.478261 |
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) | 4.572650 | 4.464589 |
Wallace & Gromit: The Best of Aardman Animation (1996) | 4.563107 | 4.385075 |
Schindler's List (1993) | 4.562602 | 4.491415 |
Shawshank Redemption, The (1994) | 4.539075 | 4.560625 |
Grand Day Out, A (1992) | 4.537879 | 4.293255 |
To Kill a Mockingbird (1962) | 4.536667 | 4.372611 |
Creature Comforts (1990) | 4.513889 | 4.272277 |
Usual Suspects, The (1995) | 4.513317 | 4.518248 |
mean_ratings['diff'] = mean_ratings['M']-mean_ratings['F']
sorted_by_diff = mean_ratings.sort_index(by='diff')
sorted_by_diff[:15]
gender | F | M | diff |
---|---|---|---|
title | |||
Dirty Dancing (1987) | 3.790378 | 2.959596 | -0.830782 |
Jumpin' Jack Flash (1986) | 3.254717 | 2.578358 | -0.676359 |
Grease (1978) | 3.975265 | 3.367041 | -0.608224 |
Little Women (1994) | 3.870588 | 3.321739 | -0.548849 |
Steel Magnolias (1989) | 3.901734 | 3.365957 | -0.535777 |
Anastasia (1997) | 3.800000 | 3.281609 | -0.518391 |
Rocky Horror Picture Show, The (1975) | 3.673016 | 3.160131 | -0.512885 |
Color Purple, The (1985) | 4.158192 | 3.659341 | -0.498851 |
Age of Innocence, The (1993) | 3.827068 | 3.339506 | -0.487561 |
Free Willy (1993) | 2.921348 | 2.438776 | -0.482573 |
French Kiss (1995) | 3.535714 | 3.056962 | -0.478752 |
Little Shop of Horrors, The (1960) | 3.650000 | 3.179688 | -0.470312 |
Guys and Dolls (1955) | 4.051724 | 3.583333 | -0.468391 |
Mary Poppins (1964) | 4.197740 | 3.730594 | -0.467147 |
Patch Adams (1998) | 3.473282 | 3.008746 | -0.464536 |
sorted_by_diff[::-1][:15]
gender | F | M | diff |
---|---|---|---|
title | |||
Good, The Bad and The Ugly, The (1966) | 3.494949 | 4.221300 | 0.726351 |
Kentucky Fried Movie, The (1977) | 2.878788 | 3.555147 | 0.676359 |
Dumb & Dumber (1994) | 2.697987 | 3.336595 | 0.638608 |
Longest Day, The (1962) | 3.411765 | 4.031447 | 0.619682 |
Cable Guy, The (1996) | 2.250000 | 2.863787 | 0.613787 |
Evil Dead II (Dead By Dawn) (1987) | 3.297297 | 3.909283 | 0.611985 |
Hidden, The (1987) | 3.137931 | 3.745098 | 0.607167 |
Rocky III (1982) | 2.361702 | 2.943503 | 0.581801 |
Caddyshack (1980) | 3.396135 | 3.969737 | 0.573602 |
For a Few Dollars More (1965) | 3.409091 | 3.953795 | 0.544704 |
Porky's (1981) | 2.296875 | 2.836364 | 0.539489 |
Animal House (1978) | 3.628906 | 4.167192 | 0.538286 |
Exorcist, The (1973) | 3.537634 | 4.067239 | 0.529605 |
Fright Night (1985) | 2.973684 | 3.500000 | 0.526316 |
Barb Wire (1996) | 1.585366 | 2.100386 | 0.515020 |
rating_std_by_title = data.groupby('title')['rating'].std()
rating_std_by_title = rating_std_by_title.ix[active_titles]
rating_std_by_title.order(ascending=False)[:10]
title Dumb & Dumber (1994) 1.321333 Blair Witch Project, The (1999) 1.316368 Natural Born Killers (1994) 1.307198 Tank Girl (1995) 1.277695 Rocky Horror Picture Show, The (1975) 1.260177 Eyes Wide Shut (1999) 1.259624 Evita (1996) 1.253631 Billy Madison (1995) 1.249970 Fear and Loathing in Las Vegas (1998) 1.246408 Bicentennial Man (1999) 1.245533 Name: rating, dtype: float64
path = '/Users/sergulaydore/pydata-book/ch02/names/'
!head -n 10 '/Users/sergulaydore/pydata-book/ch02/names/yob1880.txt'
names1880 = pd.read_csv(path + 'yob1880.txt', names = ['name','sex','births'])
names1880[:10]
name | sex | births | |
---|---|---|---|
0 | Mary | F | 7065 |
1 | Anna | F | 2604 |
2 | Emma | F | 2003 |
3 | Elizabeth | F | 1939 |
4 | Minnie | F | 1746 |
5 | Margaret | F | 1578 |
6 | Ida | F | 1472 |
7 | Alice | F | 1414 |
8 | Bertha | F | 1320 |
9 | Sarah | F | 1288 |
names1880.groupby('sex').births.sum()
sex F 90993 M 110493 Name: births, dtype: int64
years = range(1880, 2011)
pieces = []
columns = ['name','sex','births']
for year in years:
path_file = path + 'yob%d.txt' %year
frame = pd.read_csv(path_file, names=columns)
frame['year']=year
pieces.append(frame)
# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)
total_births = names.pivot_table('births',rows='year',cols='sex',aggfunc=sum)
total_births.tail()
sex | F | M |
---|---|---|
year | ||
2006 | 1896468 | 2050234 |
2007 | 1916888 | 2069242 |
2008 | 1883645 | 2032310 |
2009 | 1827643 | 1973359 |
2010 | 1759010 | 1898382 |
total_births.plot(title='Total births by sex and year')
<matplotlib.axes._subplots.AxesSubplot at 0x114efac10>
Next, let's insert a column prop with the fraction of babies given each name realtive to the total number of births.
def add_prop(group):
# integer division floors
births = group.births.astype('float')
group['prop'] = births / births.sum()
return group
names = names.groupby(['year','sex']).apply(add_prop)
names.head()
name | sex | births | year | prop | |
---|---|---|---|---|---|
0 | Mary | F | 7065 | 1880 | 0.077643 |
1 | Anna | F | 2604 | 1880 | 0.028618 |
2 | Emma | F | 2003 | 1880 | 0.022013 |
3 | Elizabeth | F | 1939 | 1880 | 0.021309 |
4 | Minnie | F | 1746 | 1880 | 0.019188 |
np.allclose(names.groupby(['year','sex']).prop.sum(),1)
True
the top 1000 names for each sex/year combination
def get_top1000(group):
return group.sort_index(by='births',ascending=False)[:1000]
grouped = names.groupby(['year','sex'])
top1000 = grouped.apply(get_top1000)
top1000[:15]
name | sex | births | year | prop | |||
---|---|---|---|---|---|---|---|
year | sex | ||||||
1880 | F | 0 | Mary | F | 7065 | 1880 | 0.077643 |
1 | Anna | F | 2604 | 1880 | 0.028618 | ||
2 | Emma | F | 2003 | 1880 | 0.022013 | ||
3 | Elizabeth | F | 1939 | 1880 | 0.021309 | ||
4 | Minnie | F | 1746 | 1880 | 0.019188 | ||
5 | Margaret | F | 1578 | 1880 | 0.017342 | ||
6 | Ida | F | 1472 | 1880 | 0.016177 | ||
7 | Alice | F | 1414 | 1880 | 0.015540 | ||
8 | Bertha | F | 1320 | 1880 | 0.014507 | ||
9 | Sarah | F | 1288 | 1880 | 0.014155 | ||
10 | Annie | F | 1258 | 1880 | 0.013825 | ||
11 | Clara | F | 1226 | 1880 | 0.013474 | ||
12 | Ella | F | 1156 | 1880 | 0.012704 | ||
13 | Florence | F | 1063 | 1880 | 0.011682 | ||
14 | Cora | F | 1045 | 1880 | 0.011484 |
boys = top1000[top1000.sex=='M']
girls = top1000[top1000.sex=='F']
total_births = top1000.pivot_table('births',rows='year',cols='name',aggfunc=sum)
total_births[:10]
name | Aaden | Aaliyah | Aarav | Aaron | Aarush | Ab | Abagail | Abb | Abbey | Abbie | ... | Zoa | Zoe | Zoey | Zoie | Zola | Zollie | Zona | Zora | Zula | Zuri |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
year | |||||||||||||||||||||
1880 | NaN | NaN | NaN | 102 | NaN | NaN | NaN | NaN | NaN | 71 | ... | 8 | 23 | NaN | NaN | 7 | NaN | 8 | 28 | 27 | NaN |
1881 | NaN | NaN | NaN | 94 | NaN | NaN | NaN | NaN | NaN | 81 | ... | NaN | 22 | NaN | NaN | 10 | NaN | 9 | 21 | 27 | NaN |
1882 | NaN | NaN | NaN | 85 | NaN | NaN | NaN | NaN | NaN | 80 | ... | 8 | 25 | NaN | NaN | 9 | NaN | 17 | 32 | 21 | NaN |
1883 | NaN | NaN | NaN | 105 | NaN | NaN | NaN | NaN | NaN | 79 | ... | NaN | 23 | NaN | NaN | 10 | NaN | 11 | 35 | 25 | NaN |
1884 | NaN | NaN | NaN | 97 | NaN | NaN | NaN | NaN | NaN | 98 | ... | 13 | 31 | NaN | NaN | 14 | 6 | 8 | 58 | 27 | NaN |
1885 | NaN | NaN | NaN | 88 | NaN | 6 | NaN | NaN | NaN | 88 | ... | 6 | 27 | NaN | NaN | 12 | 6 | 14 | 48 | 38 | NaN |
1886 | NaN | NaN | NaN | 86 | NaN | NaN | NaN | NaN | NaN | 84 | ... | 13 | 25 | NaN | NaN | 8 | NaN | 20 | 52 | 43 | NaN |
1887 | NaN | NaN | NaN | 78 | NaN | NaN | NaN | NaN | NaN | 104 | ... | 9 | 34 | NaN | NaN | 23 | NaN | 28 | 46 | 33 | NaN |
1888 | NaN | NaN | NaN | 90 | NaN | NaN | NaN | NaN | NaN | 137 | ... | 11 | 42 | NaN | NaN | 23 | 7 | 30 | 42 | 45 | NaN |
1889 | NaN | NaN | NaN | 85 | NaN | NaN | NaN | NaN | NaN | 107 | ... | 14 | 29 | NaN | NaN | 22 | NaN | 29 | 53 | 55 | NaN |
10 rows × 6868 columns
subset = total_births[['John','Harry','Mary','Marilyn']]
subset.plot(subplots=True, figsize=(12,10), grid=False,
title='Number of births per year')
array([<matplotlib.axes._subplots.AxesSubplot object at 0x10f360e10>, <matplotlib.axes._subplots.AxesSubplot object at 0x10dcca610>, <matplotlib.axes._subplots.AxesSubplot object at 0x10298cb10>, <matplotlib.axes._subplots.AxesSubplot object at 0x110200410>], dtype=object)
table = top1000.pivot_table('prop',rows='year',cols='sex',aggfunc=sum)
table.plot(title='Sum of table1000.prop by year ans sex',
yticks=np.linspace(0,1.2,13), xticks=range(1880,2020,10))
<matplotlib.axes._subplots.AxesSubplot at 0x10fc64a50>
df = boys[boys.year==2010]
df[:10]
name | sex | births | year | prop | |||
---|---|---|---|---|---|---|---|
year | sex | ||||||
2010 | M | 1676644 | Jacob | M | 21875 | 2010 | 0.011523 |
1676645 | Ethan | M | 17866 | 2010 | 0.009411 | ||
1676646 | Michael | M | 17133 | 2010 | 0.009025 | ||
1676647 | Jayden | M | 17030 | 2010 | 0.008971 | ||
1676648 | William | M | 16870 | 2010 | 0.008887 | ||
1676649 | Alexander | M | 16634 | 2010 | 0.008762 | ||
1676650 | Noah | M | 16281 | 2010 | 0.008576 | ||
1676651 | Daniel | M | 15679 | 2010 | 0.008259 | ||
1676652 | Aiden | M | 15403 | 2010 | 0.008114 | ||
1676653 | Anthony | M | 15364 | 2010 | 0.008093 |
prop_cumsum = df.sort_index(by='prop',ascending=False).prop.cumsum()
prop_cumsum[:10]
year sex 2010 M 1676644 0.011523 1676645 0.020934 1676646 0.029959 1676647 0.038930 1676648 0.047817 1676649 0.056579 1676650 0.065155 1676651 0.073414 1676652 0.081528 1676653 0.089621 Name: prop, dtype: float64
prop_cumsum.values.searchsorted(0.5)
116
def get_quantile_count(group, q=0.5):
group = group.sort_index(by='prop',ascending=False)
return group.prop.cumsum().values.searchsorted(q)+1
diversity = top1000.groupby(['year','sex']).apply(get_quantile_count)
diversity.head()
year sex 1880 F 38 M 14 1881 F 38 M 14 1882 F 38 dtype: int64
diversity = diversity.unstack('sex')
diversity.head()
sex | F | M |
---|---|---|
year | ||
1880 | 38 | 14 |
1881 | 38 | 14 |
1882 | 38 | 15 |
1883 | 39 | 15 |
1884 | 39 | 16 |
diversity.plot(title='Number of popular names in top 50%')
<matplotlib.axes._subplots.AxesSubplot at 0x1124d0050>
# extract last letter from name column
get_last_letter = lambda x: x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name='last_letter'
table = names.pivot_table('births', rows=last_letters,
cols=['sex','year'], aggfunc=sum)
table[:10]
sex | F | ... | M | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
year | 1880 | 1881 | 1882 | 1883 | 1884 | 1885 | 1886 | 1887 | 1888 | 1889 | ... | 2001 | 2002 | 2003 | 2004 | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 |
last_letter | |||||||||||||||||||||
a | 31446 | 31581 | 36536 | 38330 | 43680 | 45408 | 49100 | 48942 | 59442 | 58631 | ... | 39124 | 38815 | 37825 | 38650 | 36838 | 36156 | 34654 | 32901 | 31430 | 28438 |
b | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 50950 | 49284 | 48065 | 45914 | 43144 | 42600 | 42123 | 39945 | 38862 | 38859 |
c | NaN | NaN | 5 | 5 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 27113 | 27238 | 27697 | 26778 | 26078 | 26635 | 26864 | 25318 | 24048 | 23125 |
d | 609 | 607 | 734 | 810 | 916 | 862 | 1007 | 1027 | 1298 | 1374 | ... | 60838 | 55829 | 53391 | 51754 | 50670 | 51410 | 50595 | 47910 | 46172 | 44398 |
e | 33378 | 34080 | 40399 | 41914 | 48089 | 49616 | 53884 | 54353 | 66750 | 66663 | ... | 145395 | 144651 | 144769 | 142098 | 141123 | 142999 | 143698 | 140966 | 135496 | 129012 |
f | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1758 | 1817 | 1819 | 1904 | 1985 | 1968 | 2090 | 2195 | 2212 | 2255 |
g | 7 | 5 | 12 | 8 | 24 | 11 | 18 | 25 | 44 | 28 | ... | 2151 | 2084 | 2009 | 1837 | 1882 | 1929 | 2040 | 2059 | 2396 | 2666 |
h | 4863 | 4784 | 5567 | 5701 | 6602 | 6624 | 7146 | 7141 | 8630 | 8826 | ... | 85959 | 88085 | 88226 | 89620 | 92497 | 98477 | 99414 | 100250 | 99979 | 98090 |
i | 61 | 78 | 81 | 76 | 84 | 92 | 85 | 105 | 141 | 134 | ... | 20980 | 23610 | 26011 | 28500 | 31317 | 33558 | 35231 | 38151 | 40912 | 42956 |
j | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1069 | 1088 | 1203 | 1094 | 1291 | 1241 | 1254 | 1381 | 1416 | 1459 |
10 rows × 262 columns
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()
sex | F | M | ||||
---|---|---|---|---|---|---|
year | 1910 | 1960 | 2010 | 1910 | 1960 | 2010 |
last_letter | ||||||
a | 108376 | 691247 | 670605 | 977 | 5204 | 28438 |
b | NaN | 694 | 450 | 411 | 3912 | 38859 |
c | 5 | 49 | 946 | 482 | 15476 | 23125 |
d | 6750 | 3729 | 2607 | 22111 | 262112 | 44398 |
e | 133569 | 435013 | 313833 | 28655 | 178823 | 129012 |
subtable.sum()
sex year F 1910 396416 1960 2022062 2010 1759010 M 1910 194198 1960 2132588 2010 1898382 dtype: float64
letter_prop = subtable/subtable.sum().astype(float)
import matplotlib.pyplot as plt
fig, axes = plt.subplots(2,1,figsize=(10,8))
letter_prop['M'].plot(kind='bar',rot=0,ax=axes[0],title='Male')
letter_prop['F'].plot(kind='bar',rot=0,ax=axes[1],title='Female',
legend=False)
<matplotlib.axes._subplots.AxesSubplot at 0x117c9cad0>
letter_prop = table/table.sum().astype(float)
dny_ts = letter_prop.ix[['d','n','y'],'M'].T
dny_ts.head()
d | n | y | |
---|---|---|---|
year | |||
1880 | 0.083055 | 0.153213 | 0.075760 |
1881 | 0.083247 | 0.153214 | 0.077451 |
1882 | 0.085340 | 0.149560 | 0.077537 |
1883 | 0.084066 | 0.151646 | 0.079144 |
1884 | 0.086120 | 0.149915 | 0.080405 |
dny_ts.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x116ca0f90>
all_names = top1000.name.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])
lesley_like = all_names[mask]
lesley_like
array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)
filtered = top1000[top1000.name.isin(lesley_like)]
filtered.groupby('name').births.sum()
name Leslee 1082 Lesley 35022 Lesli 929 Leslie 370429 Lesly 10067 Name: births, dtype: int64
table = filtered.pivot_table('births',rows='year',
cols='sex', aggfunc='sum')
table = table.div(table.sum(1),axis=0)
table.tail()
sex | F | M |
---|---|---|
year | ||
2006 | 1 | NaN |
2007 | 1 | NaN |
2008 | 1 | NaN |
2009 | 1 | NaN |
2010 | 1 | NaN |
table.plot(style={'M':'k-','F':'k--'})
<matplotlib.axes._subplots.AxesSubplot at 0x11379ed10>
pwd
u'/Users/sergulaydore/python_for_data_analysis'