path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
open(path).readline()
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
import json
records = [json.loads(line) for line in open(path)]
records[0]
{u'a': u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11', u'al': u'en-US,en;q=0.8', u'c': u'US', u'cy': u'Danvers', u'g': u'A6qOVH', u'gr': u'MA', u'h': u'wfLQtf', u'hc': 1331822918, u'hh': u'1.usa.gov', u'l': u'orofrog', u'll': [42.576698, -70.954903], u'nk': 1, u'r': u'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf', u't': 1331923247, u'tz': u'America/New_York', u'u': u'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}
records[0]['tz']
u'America/New_York'
time_zones = [rec['tz'] for rec in records]
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-343-db4fbd348da9> in <module>() ----> 1 time_zones = [rec['tz'] for rec in records] KeyError: 'tz'
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
def get_counts(sequence):
    """Tally how many times each distinct item appears in sequence.

    Returns a plain dict mapping item -> occurrence count.
    """
    counts = {}
    for item in sequence:
        # dict.get with a 0 default replaces the explicit membership test
        counts[item] = counts.get(item, 0) + 1
    return counts
def get_counts2(sequence):
    """Tally how many times each distinct item appears in sequence.

    Same result as get_counts, but defaultdict(int) supplies a 0
    default so no membership test is needed.  Returns a defaultdict.
    """
    # Bug fix: defaultdict was never imported in this session (only
    # Counter is, later on), so the original raised NameError.
    from collections import defaultdict

    counts = defaultdict(int)  # missing keys default to 0
    for x in sequence:
        counts[x] += 1
    return counts
counts = get_counts(time_zones)
counts['America/New_York']
1251
len(time_zones)
3440
def top_counts(count_dict, n=10):
    """Return the n largest (count, key) pairs, in ascending count order.

    Ties are broken by key, courtesy of tuple comparison.
    """
    pairs = sorted((count, key) for key, count in count_dict.items())
    return pairs[-n:]
top_counts(counts)
[(33, u'America/Sao_Paulo'), (35, u'Europe/Madrid'), (36, u'Pacific/Honolulu'), (37, u'Asia/Tokyo'), (74, u'Europe/London'), (191, u'America/Denver'), (382, u'America/Los_Angeles'), (400, u'America/Chicago'), (521, u''), (1251, u'America/New_York')]
from collections import Counter
counter = Counter(time_zones)
counter.most_common(10)
[(u'America/New_York', 1251), (u'', 521), (u'America/Chicago', 400), (u'America/Los_Angeles', 382), (u'America/Denver', 191), (u'Europe/London', 74), (u'Asia/Tokyo', 37), (u'Pacific/Honolulu', 36), (u'Europe/Madrid', 35), (u'America/Sao_Paulo', 33)]
from pandas import DataFrame, Series
import pandas as pd; import numpy as np
frame = DataFrame(records)
frame
<class 'pandas.core.frame.DataFrame'> Int64Index: 3560 entries, 0 to 3559 Data columns (total 18 columns): _heartbeat_ 120 non-null values a 3440 non-null values al 3094 non-null values c 2919 non-null values cy 2919 non-null values g 3440 non-null values gr 2919 non-null values h 3440 non-null values hc 3440 non-null values hh 3440 non-null values kw 93 non-null values l 3440 non-null values ll 2919 non-null values nk 3440 non-null values r 3440 non-null values t 3440 non-null values tz 3440 non-null values u 3440 non-null values dtypes: float64(4), object(14)
frame['tz'][:10]
0 America/New_York 1 America/Denver 2 America/New_York 3 America/Sao_Paulo 4 America/New_York 5 America/New_York 6 Europe/Warsaw 7 8 9 Name: tz, dtype: object
tz_counts = frame['tz'].value_counts()
tz_counts[:10]
America/New_York 1251 521 America/Chicago 400 America/Los_Angeles 382 America/Denver 191 Europe/London 74 Asia/Tokyo 37 Pacific/Honolulu 36 Europe/Madrid 35 America/Sao_Paulo 33 dtype: int64
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:10]
America/New_York 1251 Unknown 521 America/Chicago 400 America/Los_Angeles 382 America/Denver 191 Missing 120 Europe/London 74 Asia/Tokyo 37 Pacific/Honolulu 36 Europe/Madrid 35 dtype: int64
tz_counts[:10].plot(kind='barh', rot=0)
<matplotlib.axes.AxesSubplot at 0x11f7165d0>
frame['a'][1]
u'GoogleMaps/RochesterNY'
frame['a'][50]
u'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
frame['a'][51]
u'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
results = Series([x.split()[0] for x in frame.a.dropna()])
results[:5]
0 Mozilla/5.0 1 GoogleMaps/RochesterNY 2 Mozilla/4.0 3 Mozilla/5.0 4 Mozilla/5.0 dtype: object
results.value_counts()[:8]
Mozilla/5.0 2594 Mozilla/4.0 601 GoogleMaps/RochesterNY 121 Opera/9.80 34 TEST_INTERNET_AGENT 24 GoogleProducer 21 Mozilla/6.0 5 BlackBerry8520/5.0.0.681 4 dtype: int64
cframe = frame[frame.a.notnull()] # 에이전트 정보가 있는 것만 추려냄
operating_system = np.where(cframe['a'].str.contains('Windows'),
'Windows', 'Not Windows')
operating_system[:5]
0 Windows 1 Not Windows 2 Windows 3 Not Windows 4 Windows Name: a, dtype: object
by_tz_os = cframe.groupby(['tz', operating_system])
size()로 개수를 세고, unstack으로 옆으로 펴줍니다.
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]
a | Not Windows | Windows |
---|---|---|
tz | ||
245 | 276 | |
Africa/Cairo | 0 | 3 |
Africa/Casablanca | 0 | 1 |
Africa/Ceuta | 0 | 2 |
Africa/Johannesburg | 0 | 1 |
Africa/Lusaka | 0 | 1 |
America/Anchorage | 4 | 1 |
America/Argentina/Buenos_Aires | 1 | 0 |
America/Argentina/Cordoba | 0 | 1 |
America/Argentina/Mendoza | 0 | 1 |
indexer = agg_counts.sum(1).argsort()
indexer[:10]
tz 24 Africa/Cairo 20 Africa/Casablanca 21 Africa/Ceuta 92 Africa/Johannesburg 87 Africa/Lusaka 53 America/Anchorage 54 America/Argentina/Buenos_Aires 57 America/Argentina/Cordoba 26 America/Argentina/Mendoza 55 dtype: int64
count_subset = agg_counts.take(indexer)[-10:]
count_subset
a | Not Windows | Windows |
---|---|---|
tz | ||
America/Sao_Paulo | 13 | 20 |
Europe/Madrid | 16 | 19 |
Pacific/Honolulu | 0 | 36 |
Asia/Tokyo | 2 | 35 |
Europe/London | 43 | 31 |
America/Denver | 132 | 59 |
America/Los_Angeles | 130 | 252 |
America/Chicago | 115 | 285 |
245 | 276 | |
America/New_York | 339 | 912 |
count_subset.plot(kind='barh', stacked=True)
<matplotlib.axes.AxesSubplot at 0x127e30910>
normed_subset = count_subset.div(count_subset.sum(1), axis=0)
normed_subset.plot(kind='barh', stacked=True)
<matplotlib.axes.AxesSubplot at 0x127e71410>
!head ch02/movielens/movies.dat
1::Toy Story (1995)::Animation|Children's|Comedy 2::Jumanji (1995)::Adventure|Children's|Fantasy 3::Grumpier Old Men (1995)::Comedy|Romance 4::Waiting to Exhale (1995)::Comedy|Drama 5::Father of the Bride Part II (1995)::Comedy 6::Heat (1995)::Action|Crime|Thriller 7::Sabrina (1995)::Comedy|Romance 8::Tom and Huck (1995)::Adventure|Children's 9::Sudden Death (1995)::Action 10::GoldenEye (1995)::Action|Adventure|Thriller
import pandas as pd
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('ch02/movielens/users.dat', sep='::', header=None,
names=unames)
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('ch02/movielens/ratings.dat', sep='::', header=None,
names=rnames)
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('ch02/movielens/movies.dat', sep='::', header=None,
names=mnames)
users[:5]
user_id | gender | age | occupation | zip | |
---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 |
1 | 2 | M | 56 | 16 | 70072 |
2 | 3 | M | 25 | 15 | 55117 |
3 | 4 | M | 45 | 7 | 02460 |
4 | 5 | M | 25 | 20 | 55455 |
ratings[:5]
user_id | movie_id | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 |
1 | 1 | 661 | 3 | 978302109 |
2 | 1 | 914 | 3 | 978301968 |
3 | 1 | 3408 | 4 | 978300275 |
4 | 1 | 2355 | 5 | 978824291 |
movies[:5]
movie_id | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children's|Comedy |
1 | 2 | Jumanji (1995) | Adventure|Children's|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
data = pd.merge(pd.merge(ratings, users), movies)
data
<class 'pandas.core.frame.DataFrame'> Int64Index: 1000209 entries, 0 to 1000208 Data columns (total 10 columns): user_id 1000209 non-null values movie_id 1000209 non-null values rating 1000209 non-null values timestamp 1000209 non-null values gender 1000209 non-null values age 1000209 non-null values occupation 1000209 non-null values zip 1000209 non-null values title 1000209 non-null values genres 1000209 non-null values dtypes: int64(6), object(4)
data.ix[0]
user_id 1 movie_id 1193 rating 5 timestamp 978300760 gender F age 1 occupation 10 zip 48067 title One Flew Over the Cuckoo's Nest (1975) genres Drama Name: 0, dtype: object
mean_ratings = data.pivot_table('rating', rows='title',
cols='gender', aggfunc='mean')
mean_ratings[:5]
gender | F | M |
---|---|---|
title | ||
$1,000,000 Duck (1971) | 3.375000 | 2.761905 |
'Night Mother (1986) | 3.388889 | 3.352941 |
'Til There Was You (1997) | 2.675676 | 2.733333 |
'burbs, The (1989) | 2.793478 | 2.962085 |
...And Justice for All (1979) | 3.828571 | 3.689024 |
ratings_by_title = data.groupby('title').size()
ratings_by_title[:10]
title $1,000,000 Duck (1971) 37 'Night Mother (1986) 70 'Til There Was You (1997) 52 'burbs, The (1989) 303 ...And Justice for All (1979) 199 1-900 (1994) 2 10 Things I Hate About You (1999) 700 101 Dalmatians (1961) 565 101 Dalmatians (1996) 364 12 Angry Men (1957) 616 dtype: int64
active_titles = ratings_by_title.index[ratings_by_title >= 250]
mean_ratings = mean_ratings.ix[active_titles]
mean_ratings
<class 'pandas.core.frame.DataFrame'> Index: 1216 entries, 'burbs, The (1989) to eXistenZ (1999) Data columns (total 2 columns): F 1216 non-null values M 1216 non-null values dtypes: float64(2)
top_female_ratings = mean_ratings.sort_index(by='F', ascending=False)
top_female_ratings[:10]
gender | F | M |
---|---|---|
title | ||
Close Shave, A (1995) | 4.644444 | 4.473795 |
Wrong Trousers, The (1993) | 4.588235 | 4.478261 |
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) | 4.572650 | 4.464589 |
Wallace & Gromit: The Best of Aardman Animation (1996) | 4.563107 | 4.385075 |
Schindler's List (1993) | 4.562602 | 4.491415 |
Shawshank Redemption, The (1994) | 4.539075 | 4.560625 |
Grand Day Out, A (1992) | 4.537879 | 4.293255 |
To Kill a Mockingbird (1962) | 4.536667 | 4.372611 |
Creature Comforts (1990) | 4.513889 | 4.272277 |
Usual Suspects, The (1995) | 4.513317 | 4.518248 |
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
sorted_by_diff = mean_ratings.sort_index(by='diff')
sorted_by_diff[:15]
gender | F | M | diff |
---|---|---|---|
title | |||
Dirty Dancing (1987) | 3.790378 | 2.959596 | -0.830782 |
Jumpin' Jack Flash (1986) | 3.254717 | 2.578358 | -0.676359 |
Grease (1978) | 3.975265 | 3.367041 | -0.608224 |
Little Women (1994) | 3.870588 | 3.321739 | -0.548849 |
Steel Magnolias (1989) | 3.901734 | 3.365957 | -0.535777 |
Anastasia (1997) | 3.800000 | 3.281609 | -0.518391 |
Rocky Horror Picture Show, The (1975) | 3.673016 | 3.160131 | -0.512885 |
Color Purple, The (1985) | 4.158192 | 3.659341 | -0.498851 |
Age of Innocence, The (1993) | 3.827068 | 3.339506 | -0.487561 |
Free Willy (1993) | 2.921348 | 2.438776 | -0.482573 |
French Kiss (1995) | 3.535714 | 3.056962 | -0.478752 |
Little Shop of Horrors, The (1960) | 3.650000 | 3.179688 | -0.470312 |
Guys and Dolls (1955) | 4.051724 | 3.583333 | -0.468391 |
Mary Poppins (1964) | 4.197740 | 3.730594 | -0.467147 |
Patch Adams (1998) | 3.473282 | 3.008746 | -0.464536 |
sorted_by_diff[::-1][:15]
gender | F | M | diff |
---|---|---|---|
title | |||
Good, The Bad and The Ugly, The (1966) | 3.494949 | 4.221300 | 0.726351 |
Kentucky Fried Movie, The (1977) | 2.878788 | 3.555147 | 0.676359 |
Dumb & Dumber (1994) | 2.697987 | 3.336595 | 0.638608 |
Longest Day, The (1962) | 3.411765 | 4.031447 | 0.619682 |
Cable Guy, The (1996) | 2.250000 | 2.863787 | 0.613787 |
Evil Dead II (Dead By Dawn) (1987) | 3.297297 | 3.909283 | 0.611985 |
Hidden, The (1987) | 3.137931 | 3.745098 | 0.607167 |
Rocky III (1982) | 2.361702 | 2.943503 | 0.581801 |
Caddyshack (1980) | 3.396135 | 3.969737 | 0.573602 |
For a Few Dollars More (1965) | 3.409091 | 3.953795 | 0.544704 |
Porky's (1981) | 2.296875 | 2.836364 | 0.539489 |
Animal House (1978) | 3.628906 | 4.167192 | 0.538286 |
Exorcist, The (1973) | 3.537634 | 4.067239 | 0.529605 |
Fright Night (1985) | 2.973684 | 3.500000 | 0.526316 |
Barb Wire (1996) | 1.585366 | 2.100386 | 0.515020 |
# 타이틀 별 평점의 표준편차
rating_std_by_title = data.groupby('title')['rating'].std()
# 유효한 타이틀에 대해 좁힌다
rating_std_by_title = rating_std_by_title.ix[active_titles]
# 상위 10개만 본다
rating_std_by_title.order(ascending=False)[:10]
title Dumb & Dumber (1994) 1.321333 Blair Witch Project, The (1999) 1.316368 Natural Born Killers (1994) 1.307198 Tank Girl (1995) 1.277695 Rocky Horror Picture Show, The (1975) 1.260177 Eyes Wide Shut (1999) 1.259624 Evita (1996) 1.253631 Billy Madison (1995) 1.249970 Fear and Loathing in Las Vegas (1998) 1.246408 Bicentennial Man (1999) 1.245533 Name: rating, dtype: float64
이제, 미국 사회보장국(SSA)이 제공하는 1880년부터 현재까지 출생한 아기들의 이름 데이터를 분석해보겠습니다.
!head ch02/names/yob1880.txt
import pandas as pd
names1880 = pd.read_csv('ch02/names/yob1880.txt', names=['name', 'sex', 'births'])
names1880
<class 'pandas.core.frame.DataFrame'> Int64Index: 2000 entries, 0 to 1999 Data columns (total 3 columns): name 2000 non-null values sex 2000 non-null values births 2000 non-null values dtypes: int64(1), object(2)
names1880.groupby('sex').births.sum()
sex F 90993 M 110493 Name: births, dtype: int64
years = range(1880, 2011)
pieces = []
columns = ['name', 'sex', 'births']
for year in years:
path = 'ch02/names/yob%d.txt' % year
frame = pd.read_csv(path, names=columns)
frame['year'] = year
pieces.append(frame)
# 모든 데이터프레임들을 합친다.
names = pd.concat(pieces, ignore_index=True)
names
<class 'pandas.core.frame.DataFrame'> Int64Index: 1690784 entries, 0 to 1690783 Data columns (total 4 columns): name 1690784 non-null values sex 1690784 non-null values births 1690784 non-null values year 1690784 non-null values dtypes: int64(2), object(2)
total_births = names.pivot_table('births', rows='year',
cols='sex', aggfunc=sum)
total_births.tail()
sex | F | M |
---|---|---|
year | ||
2006 | 1896468 | 2050234 |
2007 | 1916888 | 2069242 |
2008 | 1883645 | 2032310 |
2009 | 1827643 | 1973359 |
2010 | 1759010 | 1898382 |
def add_prop(group):
    """Attach a 'prop' column: each births value as a fraction of the group total.

    Mutates and returns the group (the pattern groupby.apply expects here).
    """
    # float() on the divisor guards against truncating integer division
    # (this session predates Python 3 true division)
    total = float(group.births.sum())
    group['prop'] = group.births / total
    return group
names = names.groupby(['year', 'sex']).apply(add_prop)
names
<class 'pandas.core.frame.DataFrame'> Int64Index: 1690784 entries, 0 to 1690783 Data columns (total 5 columns): name 1690784 non-null values sex 1690784 non-null values births 1690784 non-null values year 1690784 non-null values prop 1690784 non-null values dtypes: float64(1), int64(2), object(2)
names[:6]
name | sex | births | year | prop | |
---|---|---|---|---|---|
0 | Mary | F | 7065 | 1880 | 0.077643 |
1 | Anna | F | 2604 | 1880 | 0.028618 |
2 | Emma | F | 2003 | 1880 | 0.022013 |
3 | Elizabeth | F | 1939 | 1880 | 0.021309 |
4 | Minnie | F | 1746 | 1880 | 0.019188 |
5 | Margaret | F | 1578 | 1880 | 0.017342 |
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)
True
def get_top1000(group):
    """Return the (up to) 1000 rows of group with the largest 'births' values.

    Fix: DataFrame.sort_index(by=...) was deprecated in pandas 0.17 and
    removed in 0.20; sort_values is the supported replacement with
    identical semantics.
    """
    return group.sort_values(by='births', ascending=False)[:1000]
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']
total_births = top1000.pivot_table('births', rows='year', cols='name',
aggfunc=sum)
total_births
<class 'pandas.core.frame.DataFrame'> Int64Index: 131 entries, 1880 to 2010 Columns: 6865 entries, Aaden to Zuri dtypes: float64(6865)
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False,
title="Number of births per year")
array([<matplotlib.axes.AxesSubplot object at 0x127d56dd0>, <matplotlib.axes.AxesSubplot object at 0x117ae1850>, <matplotlib.axes.AxesSubplot object at 0x127c61450>, <matplotlib.axes.AxesSubplot object at 0x127c7f410>], dtype=object)
table = top1000.pivot_table('prop', rows='year',
cols='sex', aggfunc=sum)
table.plot(title=u'Sum of table1000.prop by year and sex',
yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
<matplotlib.axes.AxesSubplot at 0x13ad505d0>
df = boys[boys.year == 2010]
df
<class 'pandas.core.frame.DataFrame'> MultiIndex: 1000 entries, (2010, M, 1676644) to (2010, M, 1677643) Data columns (total 5 columns): name 1000 non-null values sex 1000 non-null values births 1000 non-null values year 1000 non-null values prop 1000 non-null values dtypes: float64(1), int64(2), object(2)
prop_cumsum = df.sort_index(by='prop', ascending=False).prop.cumsum()
prop_cumsum[:10]
year sex 2010 M 1676644 0.011523 1676645 0.020934 1676646 0.029959 1676647 0.038930 1676648 0.047817 1676649 0.056579 1676650 0.065155 1676651 0.073414 1676652 0.081528 1676653 0.089621 dtype: float64
prop_cumsum.searchsorted(0.5) + 1
117
df = boys[boys.year == 1900]
in1900 = df.sort_index(by='prop', ascending=False).prop.cumsum()
in1900.searchsorted(0.5) + 1
25
def get_quantile_count(group, q=0.5):
    """Number of most-popular names needed to reach cumulative proportion q.

    Sorts by 'prop' descending, accumulates, and finds the insertion
    point for q (+1 because positions are 1-based counts of names).

    Fix: sort_index(by=...) was removed from pandas (0.20); sort_values
    replaces it.  searchsorted runs on .values so the result is a plain
    integer scalar across pandas versions.
    """
    group = group.sort_values(by='prop', ascending=False)
    return group.prop.cumsum().values.searchsorted(q) + 1
diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')
diversity.head()
sex | F | M |
---|---|---|
year | ||
1880 | 38 | 14 |
1881 | 38 | 14 |
1882 | 38 | 15 |
1883 | 39 | 15 |
1884 | 39 | 16 |
diversity.plot(title="Number of popular names in top 50%")
<matplotlib.axes.AxesSubplot at 0x117cb4190>
# 이름 컬럼에서 마지막 글자 추출
get_last_letter = lambda x: x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'
table = names.pivot_table('births', rows=last_letters, cols=['sex', 'year'], aggfunc=sum)
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()
sex | F | M | ||||
---|---|---|---|---|---|---|
year | 1910 | 1960 | 2010 | 1910 | 1960 | 2010 |
last_letter | ||||||
a | 108376 | 691247 | 670605 | 977 | 5204 | 28438 |
b | NaN | 694 | 450 | 411 | 3912 | 38859 |
c | 5 | 49 | 946 | 482 | 15476 | 23125 |
d | 6750 | 3729 | 2607 | 22111 | 262112 | 44398 |
e | 133569 | 435013 | 313833 | 28655 | 178823 | 129012 |
subtable.sum()
sex year F 1910 396416 1960 2022062 2010 1759010 M 1910 194198 1960 2132588 2010 1898382 dtype: float64
letter_prop = subtable / subtable.sum().astype(float)
import matplotlib.pyplot as plt
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',
legend=False)
<matplotlib.axes.AxesSubplot at 0x11f8eb190>
letter_prop = table / table.sum().astype(float)
dny_ts = letter_prop.ix[['d', 'n', 'y'], 'M'].T
dny_ts.head()
d | n | y | |
---|---|---|---|
year | |||
1880 | 0.083055 | 0.153213 | 0.075760 |
1881 | 0.083247 | 0.153214 | 0.077451 |
1882 | 0.085340 | 0.149560 | 0.077537 |
1883 | 0.084066 | 0.151646 | 0.079144 |
1884 | 0.086120 | 0.149915 | 0.080405 |
dny_ts.plot(figsize=(10, 5))
<matplotlib.axes.AxesSubplot at 0x117c9c790>
all_names = top1000.name.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])
lesley_like = all_names[mask]
lesley_like
array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)
mask
array([False, False, False, ..., False, False, False], dtype=bool)
filtered = top1000[top1000.name.isin(lesley_like)]
filtered.groupby('name').births.sum()
name Leslee 1082 Lesley 35022 Lesli 929 Leslie 370429 Lesly 10067 Name: births, dtype: int64
table = filtered.pivot_table('births', rows='year', cols='sex', aggfunc='sum')
table = table.div(table.sum(1), axis=0)
table.tail()
sex | F | M |
---|---|---|
year | ||
2006 | 1 | NaN |
2007 | 1 | NaN |
2008 | 1 | NaN |
2009 | 1 | NaN |
2010 | 1 | NaN |
table.plot(style={'M': 'k-', 'F': 'k--'}, figsize=(10, 5))
<matplotlib.axes.AxesSubplot at 0x13addb350>
import numpy as np
import pandas as pd