import pandas as pd
import numpy as np
pd.__version__
'0.11.0'
values = [1.5, 5, -5, 4, 2]
s = pd.Series(values)
s
0 1.5 1 5.0 2 -5.0 3 4.0 4 2.0 dtype: float64
s.values
array([ 1.5, 5. , -5. , 4. , 2. ])
s.index
Int64Index([0, 1, 2, 3, 4], dtype=int64)
labels = ['a', 'b', 'c', 'd', 'e']
s2 = pd.Series(values, index=labels)
s2
a 1.5 b 5.0 c -5.0 d 4.0 e 2.0 dtype: float64
s2.index
Index([a, b, c, d, e], dtype=object)
s2.index[0]
'a'
s2[2]
-5.0
s2['c']
-5.0
s2[:3]
a 1.5 b 5.0 c -5.0 dtype: float64
s2[['a', 'c', 'd']]
a 1.5 c -5.0 d 4.0 dtype: float64
# missing data
s2[['a', 'c', 'd', 'f']]
a 1.5 c -5.0 d 4.0 f NaN dtype: float64
s3 = s2[['a', 'c', 'd', 'f']]
s3[s3.isnull()]
f NaN dtype: float64
s3[s3.notnull()]
a 1.5 c -5.0 d 4.0 dtype: float64
s3.dropna()
a 1.5 c -5.0 d 4.0 dtype: float64
s3.fillna(0)
a 1.5 c -5.0 d 4.0 f 0.0 dtype: float64
s3.mean()
0.16666666666666666
s3.fillna(s3.mean())
a 1.500000 c -5.000000 d 4.000000 f 0.166667 dtype: float64
data = {
'two': s2,
'three': s3.fillna(s3.mean())
}
df = pd.DataFrame(data)
df
three | two | |
---|---|---|
a | 1.500000 | 1.5 |
b | NaN | 5.0 |
c | -5.000000 | -5.0 |
d | 4.000000 | 4.0 |
e | NaN | 2.0 |
f | 0.166667 | NaN |
df.index.name = 'edition'
df.columns.name = 'number'
df
number | three | two |
---|---|---|
edition | ||
a | 1.500000 | 1.5 |
b | NaN | 5.0 |
c | -5.000000 | -5.0 |
d | 4.000000 | 4.0 |
e | NaN | 2.0 |
f | 0.166667 | NaN |
np.set_printoptions(precision=4)
np.random.randn(5, 3).T
array([[-0.5463, -0.3112, -0.9632, 1.8207, 2.6747], [ 0.3955, 0.8636, -0.0378, -0.3928, -0.4491], [-0.4356, 0.9094, -0.9029, -2.0247, 0.2625]])
df.T
edition | a | b | c | d | e | f |
---|---|---|---|---|---|---|
number | ||||||
three | 1.5 | NaN | -5 | 4 | NaN | 0.166667 |
two | 1.5 | 5 | -5 | 4 | 2 | NaN |
df['three']
edition a 1.500000 b NaN c -5.000000 d 4.000000 e NaN f 0.166667 Name: three, dtype: float64
df
number | three | two |
---|---|---|
edition | ||
a | 1.500000 | 1.5 |
b | NaN | 5.0 |
c | -5.000000 | -5.0 |
d | 4.000000 | 4.0 |
e | NaN | 2.0 |
f | 0.166667 | NaN |
df2 = pd.DataFrame(data, columns=['one', 'two', 'three'])
df2
one | two | three | |
---|---|---|---|
a | NaN | 1.5 | 1.500000 |
b | NaN | 5.0 | NaN |
c | NaN | -5.0 | -5.000000 |
d | NaN | 4.0 | 4.000000 |
e | NaN | 2.0 | NaN |
f | NaN | NaN | 0.166667 |
df
number | three | two |
---|---|---|
edition | ||
a | 1.500000 | 1.5 |
b | NaN | 5.0 |
c | -5.000000 | -5.0 |
d | 4.000000 | 4.0 |
e | NaN | 2.0 |
f | 0.166667 | NaN |
df[['two', 'three']]
number | two | three |
---|---|---|
edition | ||
a | 1.5 | 1.500000 |
b | 5.0 | NaN |
c | -5.0 | -5.000000 |
d | 4.0 | 4.000000 |
e | 2.0 | NaN |
f | NaN | 0.166667 |
df.ix['c']
number three -5 two -5 Name: c, dtype: float64
df.ix['c':]
number | three | two |
---|---|---|
edition | ||
c | -5.000000 | -5 |
d | 4.000000 | 4 |
e | NaN | 2 |
f | 0.166667 | NaN |
df.ix[['c', 'e']]
number | three | two |
---|---|---|
c | -5 | -5 |
e | NaN | 2 |
df.ix[['c', 'e'], 'three']
c -5 e NaN Name: three, dtype: float64
df.ix[['c', 'e'], ['two', 'three']]
two | three | |
---|---|---|
c | -5 | -5 |
e | 2 | NaN |
df['four'] = df['two'] * 2
df
number | three | two | four |
---|---|---|---|
edition | |||
a | 1.500000 | 1.5 | 3 |
b | NaN | 5.0 | 10 |
c | -5.000000 | -5.0 | -10 |
d | 4.000000 | 4.0 | 8 |
e | NaN | 2.0 | 4 |
f | 0.166667 | NaN | NaN |
del df['four']
df
three | two | |
---|---|---|
edition | ||
a | 1.500000 | 1.5 |
b | NaN | 5.0 |
c | -5.000000 | -5.0 |
d | 4.000000 | 4.0 |
e | NaN | 2.0 |
f | 0.166667 | NaN |
df.three
edition a 1.500000 b NaN c -5.000000 d 4.000000 e NaN f 0.166667 Name: three, dtype: float64
df['three']
edition a 1.500000 b NaN c -5.000000 d 4.000000 e NaN f 0.166667 Name: three, dtype: float64
df['index'] = 0
df
three | two | index | |
---|---|---|---|
edition | |||
a | 1.500000 | 1.5 | 0 |
b | NaN | 5.0 | 0 |
c | -5.000000 | -5.0 | 0 |
d | 4.000000 | 4.0 | 0 |
e | NaN | 2.0 | 0 |
f | 0.166667 | NaN | 0 |
df.index
Index([a, b, c, d, e, f], dtype=object)
!head -n 5 PandasUsers.csv
Id,Reputation,CreationDate,DisplayName,LastAccessDate,WebsiteUrl,Location,AboutMe,Views,UpVotes,DownVotes,EmailHash,Age 1843099,100,2012-11-21 19:25:04,user1843099,2013-02-03 03:46:50,,,,0,16,0,17a9c54b937f8b505dcb3b0b5b34cbb6, 854739,766,2011-07-20 20:29:21,mikebmassey,2013-02-02 17:40:38,http://Itsprivate...,"Charlotte, NC","<p>I'm a analytics guy for a financial company. Passionate about new technology. Trying to simultaneously learn 5 different programming languages and failing at all of them...</p> ",0,242,3,61af09414341715d110f08b7505fe114,35.0 1479269,472,2012-06-25 07:10:33,dmvianna,2013-01-31 10:20:43,,"Melbourne, Australia","<p>Former neuroscientist now data analyst.</p>
askers = pd.read_csv('PandasUsers.csv')
questions = pd.read_csv('PandasPosts.csv')
helpers = pd.read_csv('PandasAnswerers.csv')
answers = pd.read_csv('PandasAnswers.csv')
askers
<class 'pandas.core.frame.DataFrame'> Int64Index: 490 entries, 0 to 489 Data columns (total 13 columns): Id 490 non-null values Reputation 490 non-null values CreationDate 490 non-null values DisplayName 490 non-null values LastAccessDate 490 non-null values WebsiteUrl 81 non-null values Location 143 non-null values AboutMe 105 non-null values Views 490 non-null values UpVotes 490 non-null values DownVotes 490 non-null values EmailHash 490 non-null values Age 91 non-null values dtypes: float64(1), int64(5), object(7)
askers.ix[100]
Id 1301710 Reputation 3110 CreationDate 2012-03-29 19:23:17 DisplayName bmu LastAccessDate 2013-02-02 22:18:29 WebsiteUrl NaN Location NaN AboutMe <p>My about me is currently blank.</p>\n Views 0 UpVotes 975 DownVotes 38 EmailHash c09e91f8758f4ea83dbaa5e199ebb71a Age NaN Name: 100, dtype: object
askers.ix[100]['CreationDate']
'2012-03-29 19:23:17'
questions
<class 'pandas.core.frame.DataFrame'> Int64Index: 952 entries, 0 to 951 Data columns (total 18 columns): Id 952 non-null values PostTypeId 952 non-null values AcceptedAnswerId 656 non-null values CreationDate 952 non-null values Score 952 non-null values ViewCount 952 non-null values Body 952 non-null values OwnerUserId 952 non-null values OwnerDisplayName 1 non-null values LastEditorUserId 438 non-null values LastEditDate 438 non-null values LastActivityDate 952 non-null values Title 952 non-null values Tags 952 non-null values AnswerCount 903 non-null values CommentCount 438 non-null values FavoriteCount 303 non-null values ClosedDate 21 non-null values dtypes: float64(5), int64(5), object(8)
answers
<class 'pandas.core.frame.DataFrame'> Int64Index: 1331 entries, 0 to 1330 Data columns (total 13 columns): Id 1331 non-null values PostTypeId 1331 non-null values ParentId 1331 non-null values CreationDate 1331 non-null values Score 1331 non-null values Body 1331 non-null values OwnerUserId 1329 non-null values OwnerDisplayName 2 non-null values LastEditorUserId 366 non-null values LastEditorDisplayName 2 non-null values LastEditDate 368 non-null values LastActivityDate 1331 non-null values CommentCount 804 non-null values dtypes: float64(3), int64(4), object(6)
askers.DisplayName
0 user1843099 1 mikebmassey 2 dmvianna 3 ezbentley 4 Ralph 5 Evan Davey 6 Nicola Vianello 7 Roger 8 Shane 9 roysc 10 RJCL 11 PhE 12 Martin C. Martin 13 Einar 14 javier ... 475 briant57 476 user1470604 477 cpcloud 478 mhubig 479 akiladila 480 user1246428 481 Shakti 482 PlagTag 483 user14121 484 Tony 485 GBadge 486 Skylar Saveland 487 sashkello 488 user1498485 489 A Alstone Name: DisplayName, Length: 490, dtype: object
asker_count = askers.DisplayName.value_counts()
helper_count = helpers.DisplayName.value_counts()
ask_and_answer = asker_count.index.intersection(helper_count.index)
len(ask_and_answer)
118
len(askers)
490
len(helpers)
319
users = askers.append(helpers, ignore_index=True)
users
<class 'pandas.core.frame.DataFrame'> Int64Index: 809 entries, 0 to 808 Data columns (total 13 columns): Id 809 non-null values Reputation 809 non-null values CreationDate 809 non-null values DisplayName 809 non-null values LastAccessDate 809 non-null values WebsiteUrl 195 non-null values Location 298 non-null values AboutMe 240 non-null values Views 809 non-null values UpVotes 809 non-null values DownVotes 809 non-null values EmailHash 809 non-null values Age 200 non-null values dtypes: float64(1), int64(5), object(7)
users.DisplayName.duplicated().sum()
120
-users.Id.duplicated()
0 True 1 True 2 True 3 True 4 True 5 True 6 True 7 True 8 True 9 True 10 True 11 True 12 True 13 True 14 True ... 794 True 795 True 796 False 797 True 798 True 799 True 800 True 801 True 802 True 803 True 804 True 805 False 806 True 807 False 808 True Name: Id, Length: 809, dtype: bool
# filter down to distinct users: keep only the first row for each Id
users = users[-users.Id.duplicated()]
askers = askers.set_index('Id')
helpers = helpers.set_index('Id')
len(askers.index.intersection(helpers.index))
115
users
<class 'pandas.core.frame.DataFrame'> Int64Index: 694 entries, 0 to 808 Data columns (total 13 columns): Id 694 non-null values Reputation 694 non-null values CreationDate 694 non-null values DisplayName 694 non-null values LastAccessDate 694 non-null values WebsiteUrl 167 non-null values Location 255 non-null values AboutMe 202 non-null values Views 694 non-null values UpVotes 694 non-null values DownVotes 694 non-null values EmailHash 694 non-null values Age 170 non-null values dtypes: float64(1), int64(5), object(7)
questions
<class 'pandas.core.frame.DataFrame'> Int64Index: 952 entries, 0 to 951 Data columns (total 18 columns): Id 952 non-null values PostTypeId 952 non-null values AcceptedAnswerId 656 non-null values CreationDate 952 non-null values Score 952 non-null values ViewCount 952 non-null values Body 952 non-null values OwnerUserId 952 non-null values OwnerDisplayName 1 non-null values LastEditorUserId 438 non-null values LastEditDate 438 non-null values LastActivityDate 952 non-null values Title 952 non-null values Tags 952 non-null values AnswerCount 903 non-null values CommentCount 438 non-null values FavoriteCount 303 non-null values ClosedDate 21 non-null values dtypes: float64(5), int64(5), object(8)
questions = pd.merge(questions, users,
left_on='OwnerUserId',
right_on='Id',
suffixes=['_post', '_user'])
answers = pd.merge(answers, users,
left_on='OwnerUserId',
right_on='Id',
suffixes=['_post', '_user'])
column_mapping = {
'CreationDate_post': 'PostTime',
'CreationDate_user': 'UserJoinTime'
}
questions = questions.rename(columns=column_mapping)
answers = answers.rename(columns=column_mapping)
questions.PostTime[0]
'2012-11-19 02:12:10'
pd.to_datetime('2012-11-19 02:12:10')
datetime.datetime(2012, 11, 19, 2, 12, 10)
pd.to_datetime('11/12/2012', dayfirst=True)
datetime.datetime(2012, 12, 11, 0, 0)
pd.to_datetime('11/12/12', format='%d/%m/%y')
datetime.datetime(2012, 11, 12, 0, 0)
questions.PostTime.dtype
dtype('O')
pd.to_datetime(questions.PostTime)
0 2012-11-19 02:12:10 1 2012-11-20 17:22:04 2 2012-12-30 17:19:57 3 2012-12-30 22:43:48 4 2012-12-31 20:22:16 5 2012-11-19 00:04:20 6 2012-11-08 22:04:48 7 2012-11-17 15:59:38 8 2012-11-19 16:15:37 9 2012-11-21 11:23:14 10 2012-11-19 13:15:45 11 2013-01-06 08:44:13 12 2012-08-29 21:47:00 13 2012-09-03 15:16:55 14 2012-09-11 10:58:50 ... 937 2012-11-11 13:26:27 938 2012-11-13 02:18:10 939 2012-11-08 01:51:17 940 2012-11-08 23:33:25 941 2012-11-14 23:29:30 942 2012-11-14 12:04:13 943 2012-11-15 19:11:57 944 2012-11-16 17:57:11 945 2012-11-16 17:58:23 946 2012-11-15 20:27:19 947 2012-11-16 12:10:27 948 2012-11-18 22:14:06 949 2012-11-14 22:14:50 950 2012-11-17 19:34:36 951 2012-11-18 22:22:39 Name: PostTime, Length: 952, dtype: datetime64[ns]
pd.to_datetime(questions.PostTime)[0]
<Timestamp: 2012-11-19 02:12:10>
stamp = pd.to_datetime(questions.PostTime)[0]
stamp
<Timestamp: 2012-11-19 02:12:10>
stamp.tz_localize('utc')
<Timestamp: 2012-11-19 02:12:10+0000 UTC, tz=UTC>
stamp.tz_localize('utc').tz_convert('Asia/Singapore')
<Timestamp: 2012-11-19 10:12:10+0800 SGT, tz=Asia/Singapore>
questions['PostTime'] = pd.to_datetime(questions.PostTime)
answers['PostTime'] = pd.to_datetime(answers.PostTime)
answers.PostTime[5]
<Timestamp: 2012-11-07 01:23:02>
# you can do this in read_csv
posts2 = pd.read_csv('PandasPosts.csv', parse_dates=['CreationDate'])
posts2.CreationDate[0]
<Timestamp: 2012-11-19 02:12:10>
questions.sort('ViewCount', ascending=False).ViewCount
444 6454 490 4228 615 3238 486 2913 515 2624 443 2484 672 2422 849 2066 612 2052 504 2035 681 1992 497 1884 269 1817 582 1777 550 1768 ... 25 27 228 27 407 26 440 25 422 24 394 24 63 23 123 23 324 22 501 21 65 21 433 21 64 19 459 19 481 5 Name: ViewCount, Length: 952, dtype: int64
top10 = questions.sort('ViewCount', ascending=False)[:10]
top10[['Title', 'ViewCount']]
Title | ViewCount | |
---|---|---|
444 | Tutorial on PANDAS and PYTABLES | 6454 |
490 | What is the most efficient way to loop through... | 4228 |
615 | add one row in a pandas.DataFrame | 3238 |
486 | append two data frame with pandas | 2913 |
515 | Add indexed column to DataFrame with pandas | 2624 |
443 | Sort a pandas DataMatrix in ascending order | 2484 |
672 | Renaming columns in pandas | 2422 |
849 | How to fix Python Numpy/Pandas installation? | 2066 |
612 | iterating row by row through a pandas dataframe | 2052 |
504 | How to get the correlation between two timeser... | 2035 |
views = questions.ViewCount.values
views = pd.Series(views, index=questions.PostTime)
views
PostTime 2012-11-19 02:12:10 74 2012-11-20 17:22:04 107 2012-12-30 17:19:57 65 2012-12-30 22:43:48 53 2012-12-31 20:22:16 48 2012-11-19 00:04:20 71 2012-11-08 22:04:48 64 2012-11-17 15:59:38 238 2012-11-19 16:15:37 31 2012-11-21 11:23:14 44 2012-11-19 13:15:45 158 2013-01-06 08:44:13 67 2012-08-29 21:47:00 266 2012-09-03 15:16:55 365 2012-09-11 10:58:50 127 ... 2012-11-11 13:26:27 114 2012-11-13 02:18:10 151 2012-11-08 01:51:17 92 2012-11-08 23:33:25 110 2012-11-14 23:29:30 135 2012-11-14 12:04:13 87 2012-11-15 19:11:57 140 2012-11-16 17:57:11 92 2012-11-16 17:58:23 65 2012-11-15 20:27:19 158 2012-11-16 12:10:27 125 2012-11-18 22:14:06 109 2012-11-14 22:14:50 40 2012-11-17 19:34:36 74 2012-11-18 22:22:39 154 Length: 952, dtype: int64
views.index
<class 'pandas.tseries.index.DatetimeIndex'> [2012-11-19 02:12:10, ..., 2012-11-18 22:22:39] Length: 952, Freq: None, Timezone: None
views.index[0]
<Timestamp: 2012-11-19 02:12:10>
views.index.year
array([2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2012, 2012, 2013, 2012, 2013, 2013, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2013, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2013, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2012, 2013, 2012, 2012, 2012, 2012, 2012, 
2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2013, 2013, 2013, 2013, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2012, 2013, 2013, 2012, 2012, 2012, 2013, 2012, 2013, 2012, 2013, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2013, 2013, 2013, 2013, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2013, 2013, 2013, 2011, 2011, 2011, 2013, 2013, 2013, 2013, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2012, 2012, 2012, 2013, 2012, 2013, 2013, 2012, 2013, 2013, 2013, 2013, 2013, 2011, 2013, 2011, 2011, 2013, 2011, 2011, 2013, 2012, 2012, 2011, 2013, 2012, 2011, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2011, 2012, 2011, 2012, 2012, 2012, 2011, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 
2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012], dtype=int32)
year = views.index.year
month = views.index.month
views.groupby(year)
<pandas.core.groupby.SeriesGroupBy at 0x106ab0050>
sizes = views.groupby(year).size()
sizes
2011 13 2012 766 2013 173 dtype: int64
sums = views.groupby(year).sum()
sums
2011 25083 2012 208237 2013 10520 dtype: int64
reordered = sizes[[2013, 2012, 2011]]
sums / reordered
2011 1929 2012 271 2013 60 dtype: int64
print views.count() # excludes missing data (if any)
print len(views)
952 952
grouped = views.groupby(year)
grouped.agg(['count', 'sum', 'min', 'max', 'median', 'mean'])
count | sum | min | max | median | mean | |
---|---|---|---|---|---|---|
2011 | 13 | 25083 | 368 | 6454 | 1128 | 1929.461538 |
2012 | 766 | 208237 | 27 | 3238 | 143 | 271.849869 |
2013 | 173 | 10520 | 5 | 518 | 51 | 60.809249 |
views.order(ascending=False)[:5]
PostTime 2011-05-10 19:55:08 6454 2011-10-20 14:46:14 4228 2012-05-23 08:12:31 3238 2011-10-15 08:21:17 2913 2012-03-18 22:34:26 2624 dtype: int64
def topn(group, n=5):
    """Return the ``n`` largest values of ``group``, sorted descending.

    Used with GroupBy.apply to get the top-n entries per group.
    NOTE(review): ``Series.order()`` is pandas <0.17 API (this transcript
    is pandas 0.11); modern pandas uses ``sort_values(ascending=False)``.
    """
    return group.order(ascending=False)[:n]
grouped = views.groupby(year)
result = grouped.apply(topn)
result
PostTime 2011 2011-05-10 19:55:08 6454 2011-10-20 14:46:14 4228 2011-10-15 08:21:17 2913 2011-04-05 21:13:50 2484 2011-06-24 12:31:45 2035 2012 2012-05-23 08:12:31 3238 2012-03-18 22:34:26 2624 2012-07-05 14:21:15 2422 2012-09-15 11:30:35 2066 2012-05-23 23:21:18 2052 2013 2013-01-15 23:45:53 518 2013-01-07 15:42:28 459 2013-01-10 16:20:32 335 2013-01-11 18:18:42 184 2013-01-16 09:35:36 158 dtype: int64
result[2012]
PostTime 2012-05-23 08:12:31 3238 2012-03-18 22:34:26 2624 2012-07-05 14:21:15 2422 2012-09-15 11:30:35 2066 2012-05-23 23:21:18 2052 dtype: int64
result
PostTime 2011 2011-05-10 19:55:08 6454 2011-10-20 14:46:14 4228 2011-10-15 08:21:17 2913 2011-04-05 21:13:50 2484 2011-06-24 12:31:45 2035 2012 2012-05-23 08:12:31 3238 2012-03-18 22:34:26 2624 2012-07-05 14:21:15 2422 2012-09-15 11:30:35 2066 2012-05-23 23:21:18 2052 2013 2013-01-15 23:45:53 518 2013-01-07 15:42:28 459 2013-01-10 16:20:32 335 2013-01-11 18:18:42 184 2013-01-16 09:35:36 158 dtype: int64
year = views.index.year
month = views.index.month
keys = [year, month]
stats = ['count']
grouped = views.groupby(keys)
result = grouped.agg(stats)
result
count | ||
---|---|---|
2011 | 3 | 1 |
4 | 2 | |
5 | 1 | |
6 | 1 | |
9 | 1 | |
10 | 4 | |
11 | 2 | |
12 | 1 | |
2012 | 1 | 6 |
2 | 4 | |
3 | 29 | |
4 | 27 | |
5 | 44 | |
6 | 50 | |
7 | 67 | |
8 | 91 | |
9 | 94 | |
10 | 105 | |
11 | 121 | |
12 | 128 | |
2013 | 1 | 161 |
2 | 12 |
result.unstack(0)
count | |||
---|---|---|---|
2011 | 2012 | 2013 | |
1 | NaN | 6 | 161 |
2 | NaN | 4 | 12 |
3 | 1 | 29 | NaN |
4 | 2 | 27 | NaN |
5 | 1 | 44 | NaN |
6 | 1 | 50 | NaN |
7 | NaN | 67 | NaN |
8 | NaN | 91 | NaN |
9 | 1 | 94 | NaN |
10 | 4 | 105 | NaN |
11 | 2 | 121 | NaN |
12 | 1 | 128 | NaN |
result.unstack(0).fillna(0)
count | |||
---|---|---|---|
2011 | 2012 | 2013 | |
1 | 0 | 6 | 161 |
2 | 0 | 4 | 12 |
3 | 1 | 29 | 0 |
4 | 2 | 27 | 0 |
5 | 1 | 44 | 0 |
6 | 1 | 50 | 0 |
7 | 0 | 67 | 0 |
8 | 0 | 91 | 0 |
9 | 1 | 94 | 0 |
10 | 4 | 105 | 0 |
11 | 2 | 121 | 0 |
12 | 1 | 128 | 0 |
result
count | ||
---|---|---|
2011 | 3 | 1 |
4 | 2 | |
5 | 1 | |
6 | 1 | |
9 | 1 | |
10 | 4 | |
11 | 2 | |
12 | 1 | |
2012 | 1 | 6 |
2 | 4 | |
3 | 29 | |
4 | 27 | |
5 | 44 | |
6 | 50 | |
7 | 67 | |
8 | 91 | |
9 | 94 | |
10 | 105 | |
11 | 121 | |
12 | 128 | |
2013 | 1 | 161 |
2 | 12 |
result.index.names = ['year', 'month']
result
count | ||
---|---|---|
year | month | |
2011 | 3 | 1 |
4 | 2 | |
5 | 1 | |
6 | 1 | |
9 | 1 | |
10 | 4 | |
11 | 2 | |
12 | 1 | |
2012 | 1 | 6 |
2 | 4 | |
3 | 29 | |
4 | 27 | |
5 | 44 | |
6 | 50 | |
7 | 67 | |
8 | 91 | |
9 | 94 | |
10 | 105 | |
11 | 121 | |
12 | 128 | |
2013 | 1 | 161 |
2 | 12 |
result.unstack('year')
count | |||
---|---|---|---|
year | 2011 | 2012 | 2013 |
month | |||
1 | NaN | 6 | 161 |
2 | NaN | 4 | 12 |
3 | 1 | 29 | NaN |
4 | 2 | 27 | NaN |
5 | 1 | 44 | NaN |
6 | 1 | 50 | NaN |
7 | NaN | 67 | NaN |
8 | NaN | 91 | NaN |
9 | 1 | 94 | NaN |
10 | 4 | 105 | NaN |
11 | 2 | 121 | NaN |
12 | 1 | 128 | NaN |
result.unstack('year').stack('year')
count | ||
---|---|---|
month | year | |
1 | 2012 | 6 |
2013 | 161 | |
2 | 2012 | 4 |
2013 | 12 | |
3 | 2011 | 1 |
2012 | 29 | |
4 | 2011 | 2 |
2012 | 27 | |
5 | 2011 | 1 |
2012 | 44 | |
6 | 2011 | 1 |
2012 | 50 | |
7 | 2012 | 67 |
8 | 2012 | 91 |
9 | 2011 | 1 |
2012 | 94 | |
10 | 2011 | 4 |
2012 | 105 | |
11 | 2011 | 2 |
2012 | 121 | |
12 | 2011 | 1 |
2012 | 128 |
stats = ['count', 'sum', 'mean', 'median']
result = grouped.agg(stats)
result.index.names = ['year', 'month']
result
count | sum | mean | median | ||
---|---|---|---|---|---|
year | month | ||||
2011 | 3 | 1 | 1128 | 1128.000000 | 1128.0 |
4 | 2 | 3914 | 1957.000000 | 1957.0 | |
5 | 1 | 6454 | 6454.000000 | 6454.0 | |
6 | 1 | 2035 | 2035.000000 | 2035.0 | |
9 | 1 | 960 | 960.000000 | 960.0 | |
10 | 4 | 8590 | 2147.500000 | 1989.5 | |
11 | 2 | 1203 | 601.500000 | 601.5 | |
12 | 1 | 799 | 799.000000 | 799.0 | |
2012 | 1 | 6 | 4901 | 816.833333 | 544.5 |
2 | 4 | 3137 | 784.250000 | 505.0 | |
3 | 29 | 19587 | 675.413793 | 533.0 | |
4 | 27 | 16477 | 610.259259 | 487.0 | |
5 | 44 | 24072 | 547.090909 | 430.0 | |
6 | 50 | 21305 | 426.100000 | 272.5 | |
7 | 67 | 28852 | 430.626866 | 289.0 | |
8 | 91 | 26752 | 293.978022 | 206.0 | |
9 | 94 | 21712 | 230.978723 | 157.5 | |
10 | 105 | 16909 | 161.038095 | 125.0 | |
11 | 121 | 13561 | 112.074380 | 93.0 | |
12 | 128 | 10972 | 85.718750 | 66.0 | |
2013 | 1 | 161 | 10106 | 62.770186 | 52.0 |
2 | 12 | 414 | 34.500000 | 34.0 |
pd.options.display.max_columns = 20
pd.options.display.line_width = 1000
result.unstack('year')
count | sum | mean | median | |||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
year | 2011 | 2012 | 2013 | 2011 | 2012 | 2013 | 2011 | 2012 | 2013 | 2011 | 2012 | 2013 |
month | ||||||||||||
1 | NaN | 6 | 161 | NaN | 4901 | 10106 | NaN | 816.833333 | 62.770186 | NaN | 544.5 | 52 |
2 | NaN | 4 | 12 | NaN | 3137 | 414 | NaN | 784.250000 | 34.500000 | NaN | 505.0 | 34 |
3 | 1 | 29 | NaN | 1128 | 19587 | NaN | 1128.0 | 675.413793 | NaN | 1128.0 | 533.0 | NaN |
4 | 2 | 27 | NaN | 3914 | 16477 | NaN | 1957.0 | 610.259259 | NaN | 1957.0 | 487.0 | NaN |
5 | 1 | 44 | NaN | 6454 | 24072 | NaN | 6454.0 | 547.090909 | NaN | 6454.0 | 430.0 | NaN |
6 | 1 | 50 | NaN | 2035 | 21305 | NaN | 2035.0 | 426.100000 | NaN | 2035.0 | 272.5 | NaN |
7 | NaN | 67 | NaN | NaN | 28852 | NaN | NaN | 430.626866 | NaN | NaN | 289.0 | NaN |
8 | NaN | 91 | NaN | NaN | 26752 | NaN | NaN | 293.978022 | NaN | NaN | 206.0 | NaN |
9 | 1 | 94 | NaN | 960 | 21712 | NaN | 960.0 | 230.978723 | NaN | 960.0 | 157.5 | NaN |
10 | 4 | 105 | NaN | 8590 | 16909 | NaN | 2147.5 | 161.038095 | NaN | 1989.5 | 125.0 | NaN |
11 | 2 | 121 | NaN | 1203 | 13561 | NaN | 601.5 | 112.074380 | NaN | 601.5 | 93.0 | NaN |
12 | 1 | 128 | NaN | 799 | 10972 | NaN | 799.0 | 85.718750 | NaN | 799.0 | 66.0 | NaN |
views.groupby(year).agg({'foo': np.mean})
foo | |
---|---|
2011 | 1929.461538 |
2012 | 271.849869 |
2013 | 60.809249 |
# Closure factory: builds quantile aggregators whose __name__ labels the agg output columns
def q(q):
    """Build an aggregation function computing the q-th quantile.

    The returned callable is given a descriptive ``__name__`` so that
    ``groupby(...).agg([...])`` labels its output column
    ``quantile_<pct>`` rather than an anonymous name.
    """
    pct = int(q * 100)

    def quantile_func(series):
        # Close over q so each factory call yields an independent aggregator.
        return series.quantile(q)

    quantile_func.__name__ = 'quantile_%d' % pct
    return quantile_func
views.groupby(year).agg([q(0.1), q(0.5), q(0.9)])
quantile_10 | quantile_50 | quantile_90 | |
---|---|---|---|
2011 | 466.2 | 1128 | 3965.0 |
2012 | 58.0 | 143 | 599.0 |
2013 | 29.0 | 51 | 88.8 |
np.random.seed(130612001)
attendees = open('attendees.txt').read().split()
np.random.shuffle(attendees)
attendees = pd.Series(attendees)
attendees[:7]
0 khattri.vaibhav@gmail.com 1 giangdh89@gmail.com 2 johnchoo.lives@gmail.com 3 kukkillayavu@gis.a-star.edu.sg 4 wonglz@i2r.a-star.edu.sg 5 chenhl@i2r.a-star.edu.sg 6 kuanwh1996@gmail.com dtype: object
views.resample('A', how='sum')
PostTime 2011-12-31 25083 2012-12-31 208237 2013-12-31 10520 Freq: A-DEC, dtype: int64
views.resample('M', how='sum')
PostTime 2011-03-31 1128 2011-04-30 3914 2011-05-31 6454 2011-06-30 2035 2011-07-31 NaN 2011-08-31 NaN 2011-09-30 960 2011-10-31 8590 2011-11-30 1203 2011-12-31 799 2012-01-31 4901 2012-02-29 3137 2012-03-31 19587 2012-04-30 16477 2012-05-31 24072 2012-06-30 21305 2012-07-31 28852 2012-08-31 26752 2012-09-30 21712 2012-10-31 16909 2012-11-30 13561 2012-12-31 10972 2013-01-31 10106 2013-02-28 414 Freq: M, dtype: float64
views.groupby(year).sum()
2011 25083 2012 208237 2013 10520 dtype: int64
answers.DisplayName
0 Calvin Cheng 1 Calvin Cheng 2 Calvin Cheng 3 Calvin Cheng 4 Calvin Cheng 5 Calvin Cheng 6 Calvin Cheng 7 Calvin Cheng 8 Calvin Cheng 9 Calvin Cheng 10 Calvin Cheng 11 Calvin Cheng 12 Calvin Cheng 13 Calvin Cheng 14 Andy Hayden ... 1314 Tom 1315 Henry Gomersall 1316 milkypostman 1317 RParadox 1318 Mr. Squig 1319 Austin 1320 waitingkuo 1321 Victor K 1322 monkut 1323 Hyperboreus 1324 Dan Allan 1325 mankoff 1326 Yaroslav Halchenko 1327 Felix Zumstein 1328 dantes_419 Name: DisplayName, Length: 1329, dtype: object
answers.groupby('DisplayName').size()
DisplayName 0xc0de 1 Aaron 1 Aaron Digulla 1 Abe 1 Abhi 1 Abhranil Das 1 Abraham D Flaxman 1 Adam 1 Adam Greenhall 1 Adobe 1 Akavall 1 Alexander Stefanov 1 Alok 2 Aman 18 Amyunimus 1 ... user1569050 1 user1763885 1 user1827356 1 user1843099 3 user1917577 1 user1988295 1 user333700 3 user948652 2 vartec 1 velotron 1 waitingkuo 1 wiseveri 1 zach 4 zarthur 1 Óscar López 1 Length: 319, dtype: int64
num_answers = answers.groupby('DisplayName').size()
num_answers.order(ascending=False)
DisplayName Wes McKinney 208 Andy Hayden 116 Chang She 89 Wouter Overmeire 72 crewbum 40 BrenBarn 37 DSM 35 Zelazny7 34 eumiro 31 unutbu 30 root 20 bmu 18 Matti John 18 Aman 18 Rutger Kassies 14 ... Anurag Uniyal 1 Anaphory 1 Amyunimus 1 Alexander Stefanov 1 Akavall 1 Adobe 1 Adam Greenhall 1 Adam 1 Abraham D Flaxman 1 Abhranil Das 1 Abhi 1 Abe 1 Aaron Digulla 1 Aaron 1 0xc0de 1 Length: 319, dtype: int64
answers.Score
0 2 1 0 2 1 3 2 4 1 5 1 6 0 7 2 8 1 9 0 10 2 11 2 12 0 13 0 14 2 ... 1314 3 1315 4 1316 0 1317 1 1318 0 1319 1 1320 2 1321 2 1322 2 1323 1 1324 2 1325 4 1326 4 1327 1 1328 0 Name: Score, Length: 1329, dtype: int64
def sortd(self):
    """Return this Series sorted in descending order.

    Prefers Series.sort_values (pandas >= 0.17); falls back to the
    legacy Series.order for old pandas versions (e.g. 0.11, as used
    in this session) where sort_values does not exist.
    """
    if hasattr(self, 'sort_values'):
        return self.sort_values(ascending=False)
    return self.order(ascending=False)
# Monkey-patch onto Series so every Series gains a .sortd() method.
pd.Series.sortd = sortd
# pd.options.display.max_rows = 1000
# Per-author mean score and answer count, ranked by mean score.
by_name = answers.groupby('DisplayName')
result = by_name['Score'].agg(['mean', 'count'])
# NOTE(review): DataFrame.sort was removed in modern pandas — use sort_values there.
result = result.sort('mean', ascending=False)
result[:20]
mean | count | |
---|---|---|
DisplayName | ||
gcalmettes | 16.000000 | 1 |
Dirk Eddelbuettel | 12.000000 | 1 |
Matthew Dowle | 10.000000 | 2 |
tr33hous | 9.000000 | 1 |
Dani Arribas-Bel | 8.000000 | 1 |
Lennart Regebro | 8.000000 | 1 |
Joe Kington | 7.000000 | 1 |
minrk | 6.000000 | 1 |
lexual | 6.000000 | 1 |
Bi Rico | 6.000000 | 2 |
Sven Marnach | 5.000000 | 2 |
user1234440 | 5.000000 | 1 |
dbaupp | 5.000000 | 1 |
Brian Keegan | 5.000000 | 1 |
luke14free | 5.000000 | 1 |
jseabold | 5.000000 | 2 |
andrew cooke | 5.000000 | 1 |
Nick Crawford | 5.000000 | 1 |
Mike Pennington | 4.500000 | 2 |
ogrisel | 4.333333 | 3 |
# Boolean mask: authors with at least 10 answers.
result['count'] >= 10
DisplayName gcalmettes False Dirk Eddelbuettel False Matthew Dowle False tr33hous False Dani Arribas-Bel False Lennart Regebro False Joe Kington False minrk False lexual False Bi Rico False Sven Marnach False user1234440 False dbaupp False Brian Keegan False luke14free False ... gbronner False roysc False rocketman False Ross R False Rich Signell False Rian False Randy Olson False Rachel Gallen False Jason Morgan False nat False Jeff Tratner False mspadaccino False 0xc0de False velotron False Paul Hiemstra False Name: count, Length: 319, dtype: bool
# How many authors have at least 10 answers.
(result['count'] >= 10).sum()
21
# Restrict the ranking to authors with at least 10 answers.
result[result['count'] >= 10]
mean | count | |
---|---|---|
DisplayName | ||
Paul H | 3.454545 | 11 |
unutbu | 3.000000 | 30 |
BrenBarn | 2.675676 | 37 |
Wes McKinney | 2.610577 | 208 |
DSM | 2.371429 | 35 |
EMS | 2.153846 | 13 |
Wouter Overmeire | 2.097222 | 72 |
bmu | 2.055556 | 18 |
eumiro | 1.967742 | 31 |
Aman | 1.888889 | 18 |
Zelazny7 | 1.882353 | 34 |
Matti John | 1.666667 | 18 |
Andy Hayden | 1.586207 | 116 |
crewbum | 1.525000 | 40 |
root | 1.500000 | 20 |
Jeff | 1.454545 | 11 |
Chang She | 1.325843 | 89 |
Rutger Kassies | 1.285714 | 14 |
lbolla | 1.272727 | 11 |
Calvin Cheng | 1.000000 | 14 |
locojay | 0.900000 | 10 |
# Fuller per-author score statistics, ranked by standard deviation.
stats = ['mean', 'min', 'max', 'std', 'count']
result = by_name['Score'].agg(stats)
# NOTE(review): DataFrame.sort was removed in modern pandas — use sort_values there.
result = result.sort('std', ascending=False)
result[result['count'] >= 10]
mean | min | max | std | count | |
---|---|---|---|---|---|
DisplayName | |||||
Paul H | 3.454545 | 1 | 18 | 4.926736 | 11 |
Wes McKinney | 2.610577 | 0 | 40 | 4.042920 | 208 |
BrenBarn | 2.675676 | 0 | 9 | 2.055535 | 37 |
bmu | 2.055556 | 0 | 6 | 1.862074 | 18 |
unutbu | 3.000000 | 0 | 8 | 1.819435 | 30 |
EMS | 2.153846 | 0 | 6 | 1.818706 | 13 |
Aman | 1.888889 | 0 | 5 | 1.604732 | 18 |
Wouter Overmeire | 2.097222 | 0 | 6 | 1.584784 | 72 |
crewbum | 1.525000 | 0 | 6 | 1.568725 | 40 |
eumiro | 1.967742 | 0 | 6 | 1.471595 | 31 |
DSM | 2.371429 | 0 | 6 | 1.373825 | 35 |
Zelazny7 | 1.882353 | 0 | 5 | 1.365477 | 34 |
Chang She | 1.325843 | 0 | 7 | 1.346505 | 89 |
root | 1.500000 | 0 | 5 | 1.317893 | 20 |
Andy Hayden | 1.586207 | 0 | 6 | 1.305966 | 116 |
Rutger Kassies | 1.285714 | 0 | 4 | 1.204388 | 14 |
lbolla | 1.272727 | 0 | 4 | 1.103713 | 11 |
Jeff | 1.454545 | 0 | 3 | 1.035725 | 11 |
locojay | 0.900000 | 0 | 3 | 0.994429 | 10 |
Matti John | 1.666667 | 0 | 4 | 0.970143 | 18 |
Calvin Cheng | 1.000000 | 0 | 2 | 0.877058 | 14 |
# Index of author names with at least 10 answers.
top_so_users = result[result['count'] >= 10].index
top_so_users
Index([Paul H, Wes McKinney, BrenBarn, bmu, unutbu, EMS, Aman, Wouter Overmeire, crewbum, eumiro, DSM, Zelazny7, Chang She, root, Andy Hayden, Rutger Kassies, lbolla, Jeff, locojay, Matti John, Calvin Cheng], dtype=object)
# Doing this 21 times would be slow
# Boolean mask for a single author; isin handles all 21 names in one pass.
answers.DisplayName == 'Wes McKinney'
0 False 1 False 2 False 3 False 4 False 5 False 6 False 7 False 8 False 9 False 10 False 11 False 12 False 13 False 14 False ... 1314 False 1315 False 1316 False 1317 False 1318 False 1319 False 1320 False 1321 False 1322 False 1323 False 1324 False 1325 False 1326 False 1327 False 1328 False Name: DisplayName, Length: 1329, dtype: bool
# Scalar membership test against the Index.
'Wes McKinney' in top_so_users
True
# Vectorized membership: True for every answer written by any top user.
answers.DisplayName.isin(top_so_users)
0 True 1 True 2 True 3 True 4 True 5 True 6 True 7 True 8 True 9 True 10 True 11 True 12 True 13 True 14 True ... 1314 False 1315 False 1316 False 1317 False 1318 False 1319 False 1320 False 1321 False 1322 False 1323 False 1324 False 1325 False 1326 False 1327 False 1328 False Name: DisplayName, Length: 1329, dtype: bool
# Filter the answers down to those written by top users.
crit = answers.DisplayName.isin(top_so_users)
answers_subset = answers[crit]
answers_subset
<class 'pandas.core.frame.DataFrame'> Int64Index: 850 entries, 0 to 945 Data columns (total 26 columns): Id_post 850 non-null values PostTypeId 850 non-null values ParentId 850 non-null values PostTime 850 non-null values Score 850 non-null values Body 850 non-null values OwnerUserId 850 non-null values OwnerDisplayName 0 non-null values LastEditorUserId 238 non-null values LastEditorDisplayName 1 non-null values LastEditDate 239 non-null values LastActivityDate 850 non-null values CommentCount 556 non-null values Id_user 850 non-null values Reputation 850 non-null values UserJoinTime 850 non-null values DisplayName 850 non-null values LastAccessDate 850 non-null values WebsiteUrl 414 non-null values Location 542 non-null values AboutMe 617 non-null values Views 850 non-null values UpVotes 850 non-null values DownVotes 850 non-null values EmailHash 850 non-null values Age 419 non-null values dtypes: datetime64[ns](1), float64(4), int64(9), object(12)
# Doh, have to convert to index to do this
# Extract the calendar year from each answer's post timestamp.
year = pd.DatetimeIndex(answers_subset.PostTime).year
year[:20]
array([2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013], dtype=int32)
# Score statistics per (author, year) pair: grouping by a column name
# plus an array of equal length.
keys = ['DisplayName', year]
grouped = answers_subset.groupby(keys)
stats = ['mean', 'min', 'max', 'std', 'count']
result = grouped['Score'].agg(stats)
# result = result.sort('std', ascending=False)
result
mean | min | max | std | count | ||
---|---|---|---|---|---|---|
DisplayName | ||||||
Aman | 2012 | 2.000000 | 0 | 5 | 1.581139 | 17 |
2013 | 0.000000 | 0 | 0 | NaN | 1 | |
Andy Hayden | 2012 | 1.450704 | 0 | 6 | 1.307000 | 71 |
2013 | 1.800000 | 0 | 5 | 1.289820 | 45 | |
BrenBarn | 2012 | 2.705882 | 0 | 9 | 2.139519 | 34 |
2013 | 2.333333 | 2 | 3 | 0.577350 | 3 | |
Calvin Cheng | 2012 | 1.000000 | 0 | 2 | 0.877058 | 14 |
Chang She | 2012 | 1.325843 | 0 | 7 | 1.346505 | 89 |
DSM | 2012 | 2.478261 | 0 | 6 | 1.473079 | 23 |
2013 | 2.166667 | 0 | 4 | 1.193416 | 12 | |
EMS | 2012 | 2.153846 | 0 | 6 | 1.818706 | 13 |
Jeff | 2012 | 1.500000 | 0 | 3 | 1.290994 | 4 |
2013 | 1.428571 | 0 | 3 | 0.975900 | 7 | |
Matti John | 2012 | 1.666667 | 0 | 4 | 0.970143 | 18 |
Paul H | 2012 | 2.000000 | 1 | 4 | 1.154701 | 7 |
2013 | 6.000000 | 1 | 18 | 8.041559 | 4 | |
Rutger Kassies | 2012 | 1.363636 | 0 | 4 | 1.286291 | 11 |
2013 | 1.000000 | 0 | 2 | 1.000000 | 3 | |
Wes McKinney | 2011 | 8.125000 | 1 | 21 | 7.180082 | 8 |
2012 | 2.450777 | 0 | 40 | 3.780282 | 193 | |
2013 | 0.714286 | 0 | 2 | 0.951190 | 7 | |
Wouter Overmeire | 2012 | 2.097222 | 0 | 6 | 1.584784 | 72 |
Zelazny7 | 2012 | 2.125000 | 0 | 5 | 1.726888 | 8 |
2013 | 1.807692 | 0 | 5 | 1.265519 | 26 | |
bmu | 2012 | 1.875000 | 0 | 5 | 1.668333 | 16 |
2013 | 3.500000 | 1 | 6 | 3.535534 | 2 | |
crewbum | 2012 | 1.354839 | 0 | 6 | 1.427081 | 31 |
2013 | 2.111111 | 0 | 5 | 1.964971 | 9 | |
eumiro | 2012 | 1.892857 | 0 | 6 | 1.523624 | 28 |
2013 | 2.666667 | 2 | 3 | 0.577350 | 3 | |
lbolla | 2012 | 1.272727 | 0 | 4 | 1.103713 | 11 |
locojay | 2012 | 0.777778 | 0 | 3 | 0.971825 | 9 |
2013 | 2.000000 | 2 | 2 | NaN | 1 | |
root | 2012 | 1.187500 | 0 | 5 | 1.223043 | 16 |
2013 | 2.750000 | 2 | 4 | 0.957427 | 4 | |
unutbu | 2011 | 8.000000 | 8 | 8 | NaN | 1 |
2012 | 3.000000 | 0 | 7 | 1.878673 | 18 | |
2013 | 2.545455 | 2 | 5 | 0.934199 | 11 |
# Histogram of log-transformed user reputation.
np.log(users.Reputation).hist(bins=100)
<matplotlib.axes.AxesSubplot at 0x1075a5b90>
# Bucket answerers by reputation and count answers per bucket.
buckets = [0, 100, 1000, 10000, 1000000]
rep_bucket = pd.cut(answers.Reputation, buckets)
answers.groupby(rep_bucket).size()
Reputation (0, 100] 120 (100, 1000] 237 (1000, 10000] 783 (10000, 1000000] 189 dtype: int64
# Number of distinct answer authors.
len(answers.DisplayName.unique())
319
# Group answers by reputation bucket.
grouped = answers.groupby(rep_bucket)
def distinct(names):
    """Count the distinct values in *names* (a Series)."""
    return names.unique().size
# Per-bucket aggregation with different statistics per column.
statistics = {
'Score': ['mean', 'median', 'count'],
'Age': ['min', 'max'],
'DisplayName': [distinct]
}
grouped.agg(statistics)
Age | Score | DisplayName | ||||
---|---|---|---|---|---|---|
min | max | mean | median | count | distinct | |
Reputation | ||||||
(0, 100] | 23 | 45 | 0.966667 | 0 | 120 | 87 |
(100, 1000] | 17 | 46 | 1.413502 | 1 | 237 | 111 |
(1000, 10000] | 19 | 49 | 1.978289 | 1 | 783 | 83 |
(10000, 1000000] | 24 | 64 | 2.470899 | 2 | 189 | 38 |
# Row of the user with the highest reputation.
# NOTE(review): DataFrame.irow was removed in modern pandas — use .iloc there.
users.irow(users.Reputation.argmax())
Id 20862 Reputation 220371 CreationDate 2008-09-23 02:06:20 DisplayName Ignacio Vazquez-Abrams LastAccessDate 2013-02-02 22:20:15 WebsiteUrl NaN Location NaN AboutMe <p><a href="http://whathaveyoutried.com/" rel=... Views 5231 UpVotes 1426 DownVotes 63 EmailHash 2343ae368d3241278581ce6c87f62a25 Age NaN Name: 532, dtype: object
import json
# Load the food/nutrient database: a JSON list of dicts, one per food.
path = 'foods-2011-10-03.json'
db = json.load(open(path))
# NOTE(review): Python 2 print statement — use print(...) under Python 3.
print type(db)
len(db)
<type 'list'>
6636
# First food record: description, group, id, manufacturer, nested
# nutrients list, portions, and tags.
db[0]
{u'description': u'Cheese, caraway', u'group': u'Dairy and Egg Products', u'id': 1008, u'manufacturer': u'', u'nutrients': [{u'description': u'Protein', u'group': u'Composition', u'units': u'g', u'value': 25.18}, {u'description': u'Total lipid (fat)', u'group': u'Composition', u'units': u'g', u'value': 29.2}, {u'description': u'Carbohydrate, by difference', u'group': u'Composition', u'units': u'g', u'value': 3.06}, {u'description': u'Ash', u'group': u'Other', u'units': u'g', u'value': 3.28}, {u'description': u'Energy', u'group': u'Energy', u'units': u'kcal', u'value': 376.0}, {u'description': u'Water', u'group': u'Composition', u'units': u'g', u'value': 39.28}, {u'description': u'Energy', u'group': u'Energy', u'units': u'kJ', u'value': 1573.0}, {u'description': u'Fiber, total dietary', u'group': u'Composition', u'units': u'g', u'value': 0.0}, {u'description': u'Calcium, Ca', u'group': u'Elements', u'units': u'mg', u'value': 673.0}, {u'description': u'Iron, Fe', u'group': u'Elements', u'units': u'mg', u'value': 0.64}, {u'description': u'Magnesium, Mg', u'group': u'Elements', u'units': u'mg', u'value': 22.0}, {u'description': u'Phosphorus, P', u'group': u'Elements', u'units': u'mg', u'value': 490.0}, {u'description': u'Potassium, K', u'group': u'Elements', u'units': u'mg', u'value': 93.0}, {u'description': u'Sodium, Na', u'group': u'Elements', u'units': u'mg', u'value': 690.0}, {u'description': u'Zinc, Zn', u'group': u'Elements', u'units': u'mg', u'value': 2.94}, {u'description': u'Copper, Cu', u'group': u'Elements', u'units': u'mg', u'value': 0.024}, {u'description': u'Manganese, Mn', u'group': u'Elements', u'units': u'mg', u'value': 0.021}, {u'description': u'Selenium, Se', u'group': u'Elements', u'units': u'mcg', u'value': 14.5}, {u'description': u'Vitamin A, IU', u'group': u'Vitamins', u'units': u'IU', u'value': 1054.0}, {u'description': u'Retinol', u'group': u'Vitamins', u'units': u'mcg', u'value': 262.0}, {u'description': u'Vitamin A, RAE', u'group': 
u'Vitamins', u'units': u'mcg_RAE', u'value': 271.0}, {u'description': u'Vitamin C, total ascorbic acid', u'group': u'Vitamins', u'units': u'mg', u'value': 0.0}, {u'description': u'Thiamin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.031}, {u'description': u'Riboflavin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.45}, {u'description': u'Niacin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.18}, {u'description': u'Pantothenic acid', u'group': u'Vitamins', u'units': u'mg', u'value': 0.19}, {u'description': u'Vitamin B-6', u'group': u'Vitamins', u'units': u'mg', u'value': 0.074}, {u'description': u'Folate, total', u'group': u'Vitamins', u'units': u'mcg', u'value': 18.0}, {u'description': u'Vitamin B-12', u'group': u'Vitamins', u'units': u'mcg', u'value': 0.27}, {u'description': u'Folic acid', u'group': u'Vitamins', u'units': u'mcg', u'value': 0.0}, {u'description': u'Folate, food', u'group': u'Vitamins', u'units': u'mcg', u'value': 18.0}, {u'description': u'Folate, DFE', u'group': u'Vitamins', u'units': u'mcg_DFE', u'value': 18.0}, {u'description': u'Cholesterol', u'group': u'Other', u'units': u'mg', u'value': 93.0}, {u'description': u'Fatty acids, total saturated', u'group': u'Other', u'units': u'g', u'value': 18.584}, {u'description': u'Fatty acids, total monounsaturated', u'group': u'Other', u'units': u'g', u'value': 8.275}, {u'description': u'Fatty acids, total polyunsaturated', u'group': u'Other', u'units': u'g', u'value': 0.83}, {u'description': u'Tryptophan', u'group': u'Amino Acids', u'units': u'g', u'value': 0.324}, {u'description': u'Threonine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.896}, {u'description': u'Isoleucine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.563}, {u'description': u'Leucine', u'group': u'Amino Acids', u'units': u'g', u'value': 2.412}, {u'description': u'Lysine', u'group': u'Amino Acids', u'units': u'g', u'value': 2.095}, {u'description': u'Methionine', u'group': u'Amino Acids', u'units': 
u'g', u'value': 0.659}, {u'description': u'Cystine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.126}, {u'description': u'Phenylalanine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.326}, {u'description': u'Tyrosine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.216}, {u'description': u'Valine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.682}, {u'description': u'Arginine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.952}, {u'description': u'Histidine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.884}, {u'description': u'Alanine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.711}, {u'description': u'Aspartic acid', u'group': u'Amino Acids', u'units': u'g', u'value': 1.618}, {u'description': u'Glutamic acid', u'group': u'Amino Acids', u'units': u'g', u'value': 6.16}, {u'description': u'Glycine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.439}, {u'description': u'Proline', u'group': u'Amino Acids', u'units': u'g', u'value': 2.838}, {u'description': u'Serine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.472}, {u'description': u'Protein', u'group': u'Composition', u'units': u'g', u'value': 25.18}, {u'description': u'Total lipid (fat)', u'group': u'Composition', u'units': u'g', u'value': 29.2}, {u'description': u'Carbohydrate, by difference', u'group': u'Composition', u'units': u'g', u'value': 3.06}, {u'description': u'Ash', u'group': u'Other', u'units': u'g', u'value': 3.28}, {u'description': u'Energy', u'group': u'Energy', u'units': u'kcal', u'value': 376.0}, {u'description': u'Water', u'group': u'Composition', u'units': u'g', u'value': 39.28}, {u'description': u'Energy', u'group': u'Energy', u'units': u'kJ', u'value': 1573.0}, {u'description': u'Fiber, total dietary', u'group': u'Composition', u'units': u'g', u'value': 0.0}, {u'description': u'Calcium, Ca', u'group': u'Elements', u'units': u'mg', u'value': 673.0}, {u'description': u'Iron, Fe', u'group': u'Elements', u'units': 
u'mg', u'value': 0.64}, {u'description': u'Magnesium, Mg', u'group': u'Elements', u'units': u'mg', u'value': 22.0}, {u'description': u'Phosphorus, P', u'group': u'Elements', u'units': u'mg', u'value': 490.0}, {u'description': u'Potassium, K', u'group': u'Elements', u'units': u'mg', u'value': 93.0}, {u'description': u'Sodium, Na', u'group': u'Elements', u'units': u'mg', u'value': 690.0}, {u'description': u'Zinc, Zn', u'group': u'Elements', u'units': u'mg', u'value': 2.94}, {u'description': u'Copper, Cu', u'group': u'Elements', u'units': u'mg', u'value': 0.024}, {u'description': u'Manganese, Mn', u'group': u'Elements', u'units': u'mg', u'value': 0.021}, {u'description': u'Selenium, Se', u'group': u'Elements', u'units': u'mcg', u'value': 14.5}, {u'description': u'Vitamin A, IU', u'group': u'Vitamins', u'units': u'IU', u'value': 1054.0}, {u'description': u'Retinol', u'group': u'Vitamins', u'units': u'mcg', u'value': 262.0}, {u'description': u'Vitamin A, RAE', u'group': u'Vitamins', u'units': u'mcg_RAE', u'value': 271.0}, {u'description': u'Vitamin C, total ascorbic acid', u'group': u'Vitamins', u'units': u'mg', u'value': 0.0}, {u'description': u'Thiamin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.031}, {u'description': u'Riboflavin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.45}, {u'description': u'Niacin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.18}, {u'description': u'Pantothenic acid', u'group': u'Vitamins', u'units': u'mg', u'value': 0.19}, {u'description': u'Vitamin B-6', u'group': u'Vitamins', u'units': u'mg', u'value': 0.074}, {u'description': u'Folate, total', u'group': u'Vitamins', u'units': u'mcg', u'value': 18.0}, {u'description': u'Vitamin B-12', u'group': u'Vitamins', u'units': u'mcg', u'value': 0.27}, {u'description': u'Folic acid', u'group': u'Vitamins', u'units': u'mcg', u'value': 0.0}, {u'description': u'Folate, food', u'group': u'Vitamins', u'units': u'mcg', u'value': 18.0}, {u'description': u'Folate, DFE', u'group': 
u'Vitamins', u'units': u'mcg_DFE', u'value': 18.0}, {u'description': u'Tryptophan', u'group': u'Amino Acids', u'units': u'g', u'value': 0.324}, {u'description': u'Threonine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.896}, {u'description': u'Isoleucine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.563}, {u'description': u'Leucine', u'group': u'Amino Acids', u'units': u'g', u'value': 2.412}, {u'description': u'Lysine', u'group': u'Amino Acids', u'units': u'g', u'value': 2.095}, {u'description': u'Methionine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.659}, {u'description': u'Cystine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.126}, {u'description': u'Phenylalanine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.326}, {u'description': u'Tyrosine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.216}, {u'description': u'Valine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.682}, {u'description': u'Arginine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.952}, {u'description': u'Histidine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.884}, {u'description': u'Alanine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.711}, {u'description': u'Aspartic acid', u'group': u'Amino Acids', u'units': u'g', u'value': 1.618}, {u'description': u'Glutamic acid', u'group': u'Amino Acids', u'units': u'g', u'value': 6.16}, {u'description': u'Glycine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.439}, {u'description': u'Proline', u'group': u'Amino Acids', u'units': u'g', u'value': 2.838}, {u'description': u'Serine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.472}, {u'description': u'Cholesterol', u'group': u'Other', u'units': u'mg', u'value': 93.0}, {u'description': u'Fatty acids, total saturated', u'group': u'Other', u'units': u'g', u'value': 18.584}, {u'description': u'Fatty acids, total monounsaturated', u'group': u'Other', u'units': u'g', u'value': 8.275}, 
{u'description': u'Fatty acids, total polyunsaturated', u'group': u'Other', u'units': u'g', u'value': 0.83}, {u'description': u'Protein', u'group': u'Composition', u'units': u'g', u'value': 25.18}, {u'description': u'Total lipid (fat)', u'group': u'Composition', u'units': u'g', u'value': 29.2}, {u'description': u'Carbohydrate, by difference', u'group': u'Composition', u'units': u'g', u'value': 3.06}, {u'description': u'Ash', u'group': u'Other', u'units': u'g', u'value': 3.28}, {u'description': u'Energy', u'group': u'Energy', u'units': u'kcal', u'value': 376.0}, {u'description': u'Water', u'group': u'Composition', u'units': u'g', u'value': 39.28}, {u'description': u'Energy', u'group': u'Energy', u'units': u'kJ', u'value': 1573.0}, {u'description': u'Fiber, total dietary', u'group': u'Composition', u'units': u'g', u'value': 0.0}, {u'description': u'Calcium, Ca', u'group': u'Elements', u'units': u'mg', u'value': 673.0}, {u'description': u'Iron, Fe', u'group': u'Elements', u'units': u'mg', u'value': 0.64}, {u'description': u'Magnesium, Mg', u'group': u'Elements', u'units': u'mg', u'value': 22.0}, {u'description': u'Phosphorus, P', u'group': u'Elements', u'units': u'mg', u'value': 490.0}, {u'description': u'Potassium, K', u'group': u'Elements', u'units': u'mg', u'value': 93.0}, {u'description': u'Sodium, Na', u'group': u'Elements', u'units': u'mg', u'value': 690.0}, {u'description': u'Zinc, Zn', u'group': u'Elements', u'units': u'mg', u'value': 2.94}, {u'description': u'Copper, Cu', u'group': u'Elements', u'units': u'mg', u'value': 0.024}, {u'description': u'Manganese, Mn', u'group': u'Elements', u'units': u'mg', u'value': 0.021}, {u'description': u'Selenium, Se', u'group': u'Elements', u'units': u'mcg', u'value': 14.5}, {u'description': u'Vitamin A, IU', u'group': u'Vitamins', u'units': u'IU', u'value': 1054.0}, {u'description': u'Retinol', u'group': u'Vitamins', u'units': u'mcg', u'value': 262.0}, {u'description': u'Vitamin A, RAE', u'group': u'Vitamins', u'units': 
u'mcg_RAE', u'value': 271.0}, {u'description': u'Vitamin C, total ascorbic acid', u'group': u'Vitamins', u'units': u'mg', u'value': 0.0}, {u'description': u'Thiamin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.031}, {u'description': u'Riboflavin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.45}, {u'description': u'Niacin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.18}, {u'description': u'Pantothenic acid', u'group': u'Vitamins', u'units': u'mg', u'value': 0.19}, {u'description': u'Vitamin B-6', u'group': u'Vitamins', u'units': u'mg', u'value': 0.074}, {u'description': u'Folate, total', u'group': u'Vitamins', u'units': u'mcg', u'value': 18.0}, {u'description': u'Vitamin B-12', u'group': u'Vitamins', u'units': u'mcg', u'value': 0.27}, {u'description': u'Folic acid', u'group': u'Vitamins', u'units': u'mcg', u'value': 0.0}, {u'description': u'Folate, food', u'group': u'Vitamins', u'units': u'mcg', u'value': 18.0}, {u'description': u'Folate, DFE', u'group': u'Vitamins', u'units': u'mcg_DFE', u'value': 18.0}, {u'description': u'Tryptophan', u'group': u'Amino Acids', u'units': u'g', u'value': 0.324}, {u'description': u'Threonine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.896}, {u'description': u'Isoleucine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.563}, {u'description': u'Leucine', u'group': u'Amino Acids', u'units': u'g', u'value': 2.412}, {u'description': u'Lysine', u'group': u'Amino Acids', u'units': u'g', u'value': 2.095}, {u'description': u'Methionine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.659}, {u'description': u'Cystine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.126}, {u'description': u'Phenylalanine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.326}, {u'description': u'Tyrosine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.216}, {u'description': u'Valine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.682}, {u'description': u'Arginine', u'group': u'Amino 
Acids', u'units': u'g', u'value': 0.952}, {u'description': u'Histidine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.884}, {u'description': u'Alanine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.711}, {u'description': u'Aspartic acid', u'group': u'Amino Acids', u'units': u'g', u'value': 1.618}, {u'description': u'Glutamic acid', u'group': u'Amino Acids', u'units': u'g', u'value': 6.16}, {u'description': u'Glycine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.439}, {u'description': u'Proline', u'group': u'Amino Acids', u'units': u'g', u'value': 2.838}, {u'description': u'Serine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.472}, {u'description': u'Cholesterol', u'group': u'Other', u'units': u'mg', u'value': 93.0}, {u'description': u'Fatty acids, total saturated', u'group': u'Other', u'units': u'g', u'value': 18.584}, {u'description': u'Fatty acids, total monounsaturated', u'group': u'Other', u'units': u'g', u'value': 8.275}, {u'description': u'Fatty acids, total polyunsaturated', u'group': u'Other', u'units': u'g', u'value': 0.83}], u'portions': [{u'amount': 1, u'grams': 28.35, u'unit': u'oz'}], u'tags': []}
# First nutrient entry of the first food (a plain dict).
db[0]['nutrients'][0]
{u'description': u'Protein', u'group': u'Composition', u'units': u'g', u'value': 25.18}
# DataFrame from a list of dicts: keys missing from a row become NaN.
pd.DataFrame([{'a': 1, 'b': 2},
{'a': 2, 'b': 3, 'c': 4}])
a | b | c | |
---|---|---|---|
0 | 1 | 2 | NaN |
1 | 2 | 3 | 4 |
# All nutrients of the first food as a DataFrame.
pd.DataFrame(db[0]['nutrients'])
<class 'pandas.core.frame.DataFrame'> Int64Index: 162 entries, 0 to 161 Data columns (total 4 columns): description 162 non-null values group 162 non-null values units 162 non-null values value 162 non-null values dtypes: float64(1), object(3)
# First five nutrient rows of the first food.
pd.DataFrame(db[0]['nutrients']).head()
description | group | units | value | |
---|---|---|---|---|
0 | Protein | Composition | g | 25.18 |
1 | Total lipid (fat) | Composition | g | 29.20 |
2 | Carbohydrate, by difference | Composition | g | 3.06 |
3 | Ash | Other | g | 3.28 |
4 | Energy | Energy | kcal | 376.00 |
# A single nutrient record, for reference.
db[0]['nutrients'][0]
{u'description': u'Protein', u'group': u'Composition', u'units': u'g', u'value': 25.18}
# The food's numeric id (used below to tag its nutrients).
db[0]['id']
1008
# Flatten the nested structure into one row per (food, nutrient) pair,
# tagging each nutrient dict with its owning food's id.
# Note: the tagging mutates the nutrient dicts inside db in place.
all_nutrients = []
for record in db:
    food_id = record['id']
    for nut in record['nutrients']:
        nut['id'] = food_id
        all_nutrients.append(nut)
nutrients = pd.DataFrame(all_nutrients)
nutrients.head()
description | group | id | units | value | |
---|---|---|---|---|---|
0 | Protein | Composition | 1008 | g | 25.18 |
1 | Total lipid (fat) | Composition | 1008 | g | 29.20 |
2 | Carbohydrate, by difference | Composition | 1008 | g | 3.06 |
3 | Ash | Other | 1008 | g | 3.28 |
4 | Energy | Energy | 1008 | kcal | 376.00 |
# Same records, keeping only the 'value' column.
pd.DataFrame(all_nutrients, columns=['value'])
<class 'pandas.core.frame.DataFrame'> Int64Index: 389355 entries, 0 to 389354 Data columns (total 1 columns): value 389355 non-null values dtypes: float64(1)
# Food-level metadata: id, name ('description') and food group.
of_interest = ['id', 'description', 'group']
meta = pd.DataFrame(db, columns=of_interest)
meta
<class 'pandas.core.frame.DataFrame'> Int64Index: 6636 entries, 0 to 6635 Data columns (total 3 columns): id 6636 non-null values description 6636 non-null values group 6636 non-null values dtypes: int64(1), object(2)
# The flattened nutrients table.
nutrients
<class 'pandas.core.frame.DataFrame'> Int64Index: 389355 entries, 0 to 389354 Data columns (total 5 columns): description 389355 non-null values group 389355 non-null values id 389355 non-null values units 389355 non-null values value 389355 non-null values dtypes: float64(1), int64(1), object(3)
# Rename the overlapping columns before merging: 'description' and
# 'group' exist in both tables with different meanings.
mapping = {
'description': 'nutrient',
'group': 'ngroup'
}
nutrients = nutrients.rename(columns=mapping)
# COULD HAVE DONE
# nutrients.rename(columns=mapping, inplace=True)
mapping = {
'description': 'food',
'group': 'fgroup'
}
meta = meta.rename(columns=mapping)
# Join the food metadata onto every nutrient row by food id.
data = pd.merge(meta, nutrients, on='id')
data
<class 'pandas.core.frame.DataFrame'> Int64Index: 389355 entries, 0 to 389354 Data columns (total 7 columns): id 389355 non-null values food 389355 non-null values fgroup 389355 non-null values nutrient 389355 non-null values ngroup 389355 non-null values units 389355 non-null values value 389355 non-null values dtypes: float64(1), int64(1), object(5)
# Restrict to the caffeine rows only.
cafdata = data[data.nutrient == 'Caffeine']
cafdata
<class 'pandas.core.frame.DataFrame'> Int64Index: 3911 entries, 172 to 389312 Data columns (total 7 columns): id 3911 non-null values food 3911 non-null values fgroup 3911 non-null values nutrient 3911 non-null values ngroup 3911 non-null values units 3911 non-null values value 3911 non-null values dtypes: float64(1), int64(1), object(5)
# Row with label 172 in the full table.
# NOTE(review): .ix was removed in modern pandas — use .loc/.iloc there.
data.ix[172]
id 1009 food Cheese, cheddar fgroup Dairy and Egg Products nutrient Caffeine ngroup Other units mg value 0 Name: 172, dtype: object
# Same row via the filtered frame: boolean filtering preserves labels.
cafdata.ix[172]
id 1009 food Cheese, cheddar fgroup Dairy and Egg Products nutrient Caffeine ngroup Other units mg value 0 Name: 172, dtype: object
# Confirm every caffeine value is reported in the same unit (mg).
cafdata.units.value_counts()
mg 3911 dtype: int64
# MAXIMUM IN NUMPY
# argmax on the raw ndarray returns a positional index, not the frame's label.
cafdata.value.values.argmax()
3460
# INDEX OF THE MAXIMUM VALUE
# idxmax returns the index label of the largest value.
cafdata.value.idxmax()
336702
# Look up the most caffeinated food by its index label.
data.ix[cafdata.value.idxmax()]
id 14366 food Tea, instant, unsweetened, powder fgroup Beverages nutrient Caffeine ngroup Other units mg value 3680 Name: 336702, dtype: object
# Column dtypes of the caffeine subset.
cafdata.dtypes
id int64 food object fgroup object nutrient object ngroup object units object value float64 dtype: object
def max_value(group):
    """Return the row of *group* (a DataFrame) with the largest 'value'.

    The result is a Series (one row) labeled by the row's original index.
    Uses .loc, which exists since pandas 0.11, instead of .ix, which was
    removed in pandas 1.0.
    """
    return group.loc[group['value'].idxmax()]
# Most caffeinated row of the whole caffeine table.
max_value(cafdata)
id 14366 food Tea, instant, unsweetened, powder fgroup Beverages nutrient Caffeine ngroup Other units mg value 3680 Name: 336702, dtype: object
# NOTE(review): apply over groups returns object dtype for every column here.
cafdata.groupby('fgroup').apply(max_value).dtypes
id object food object fgroup object nutrient object ngroup object units object value object dtype: object
# Most caffeinated food within each food group.
cafdata.groupby('fgroup').apply(max_value)
id | food | fgroup | nutrient | ngroup | units | value | |
---|---|---|---|---|---|---|---|
fgroup | |||||||
Baby Foods | 3008 | Babyfood, meat, ham, strained | Baby Foods | Caffeine | Other | mg | 0 |
Baked Products | 18174 | Cookies, graham crackers, chocolate-coated | Baked Products | Caffeine | Other | mg | 46 |
Beef Products | 13000 | Beef, grass-fed, strip steaks, lean only, raw | Beef Products | Caffeine | Other | mg | 0 |
Beverages | 14366 | Tea, instant, unsweetened, powder | Beverages | Caffeine | Other | mg | 3680 |
Breakfast Cereals | 8294 | Cereals ready-to-eat, QUAKER, QUAKER COCOA BLASTS | Breakfast Cereals | Caffeine | Other | mg | 21 |
Cereal Grains and Pasta | 20004 | Barley, hulled | Cereal Grains and Pasta | Caffeine | Other | mg | 0 |
Dairy and Egg Products | 43260 | Beverage, instant breakfast powder, chocolate,... | Dairy and Egg Products | Caffeine | Other | mg | 52 |
Ethnic Foods | 35009 | Whale, beluga, meat, dried (Alaska Native) | Ethnic Foods | Caffeine | Other | mg | 0 |
Fast Foods | 21030 | Fast foods, cookies, chocolate chip | Fast Foods | Caffeine | Other | mg | 11 |
Fats and Oils | 2050 | Lard | Fats and Oils | Caffeine | Other | mg | 0 |
Finfish and Shellfish Products | 15001 | Fish, anchovy, european, raw | Finfish and Shellfish Products | Caffeine | Other | mg | 0 |
Fruits and Fruit Juices | 9002 | Acerola juice, raw | Fruits and Fruit Juices | Caffeine | Other | mg | 0 |
Lamb, Veal, and Game Products | 17094 | Veal, leg (top round), separable lean and fat,... | Lamb, Veal, and Game Products | Caffeine | Other | mg | 0 |
Legumes and Legume Products | 16166 | Soymilk, chocolate, unfortified | Legumes and Legume Products | Caffeine | Other | mg | 2 |
Meals, Entrees, and Sidedishes | 22247 | Macaroni and Cheese, canned entree | Meals, Entrees, and Sidedishes | Caffeine | Other | mg | 0 |
Nut and Seed Products | 12014 | Seeds, pumpkin and squash seed kernels, dried | Nut and Seed Products | Caffeine | Other | mg | 0 |
Pork Products | 10000 | Pork, fresh, enhanced, composite of separable ... | Pork Products | Caffeine | Other | mg | 0 |
Poultry Products | 5008 | Chicken, broilers or fryers, meat and skin, co... | Poultry Products | Caffeine | Other | mg | 0 |
Restaurant Foods | 36617 | Restaurant, Chinese, lemon chicken | Restaurant Foods | Caffeine | Other | mg | 0 |
Sausages and Luncheon Meats | 7008 | Bologna, beef and pork | Sausages and Luncheon Meats | Caffeine | Other | mg | 0 |
Snacks | 25031 | Formulated bar, ZONE PERFECT CLASSIC CRUNCH BA... | Snacks | Caffeine | Other | mg | 11 |
Soups, Sauces, and Gravies | 3073 | Soup, cream of asparagus, canned, condensed | Soups, Sauces, and Gravies | Caffeine | Other | mg | 0 |
Spices and Herbs | 1025 | Spices, allspice, ground | Spices and Herbs | Caffeine | Other | mg | 0 |
Sweets | 19268 | Candies, dark chocolate coated coffee beans | Sweets | Caffeine | Other | mg | 839 |
Vegetables and Vegetable Products | 11001 | Alfalfa seeds, sprouted, raw | Vegetables and Vegetable Products | Caffeine | Other | mg | 0 |
# For every (nutrient, food group) pair, find the food with the highest value.
grouped = data.groupby(['nutrient', 'fgroup'])
results = grouped.apply(max_value)
results
<class 'pandas.core.frame.DataFrame'> MultiIndex: 2246 entries, (Adjusted Protein, Sweets) to (Zinc, Zn, Vegetables and Vegetable Products) Data columns (total 7 columns): id 2246 non-null values food 2246 non-null values fgroup 2246 non-null values nutrient 2246 non-null values ngroup 2246 non-null values units 2246 non-null values value 2246 non-null values dtypes: object(7)
# Widen console output so wide frames do not wrap.
# NOTE(review): display.line_width is gone in modern pandas — use display.width there.
pd.options.display.line_width = 10000
def topn(group, by=None, n=5):
    """Return the top `n` rows of `group`, ordered descending by column `by`.

    Parameters
    ----------
    group : DataFrame
        The (sub-)frame to rank, typically one groupby group.
    by : str, optional
        Column name to sort on.
    n : int, default 5
        Number of rows to keep.
    """
    # DataFrame.sort(by=...) was removed from pandas; sort_values is the
    # supported equivalent with identical semantics here.
    return group.sort_values(by, ascending=False)[:n]
# Top-3 caffeine rows within each food group, then inspect Beverages.
# `.ix` was removed from pandas; `.loc` is the label-based replacement.
result = cafdata.groupby('fgroup').apply(topn, by='value', n=3)
result.loc['Beverages']
id | food | fgroup | nutrient | ngroup | units | value | |
---|---|---|---|---|---|---|---|
336702 | 14366 | Tea, instant, unsweetened, powder | Beverages | Caffeine | Other | mg | 3680 |
334542 | 14214 | Coffee, instant, regular, powder | Beverages | Caffeine | Other | mg | 3142 |
336913 | 14375 | Tea, instant, sweetened with sodium saccharin,... | Beverages | Caffeine | Other | mg | 2240 |
data.nutrient.unique()
array([u'Protein', u'Total lipid (fat)', u'Carbohydrate, by difference', u'Ash', u'Energy', u'Water', u'Fiber, total dietary', u'Calcium, Ca', u'Iron, Fe', u'Magnesium, Mg', u'Phosphorus, P', u'Potassium, K', u'Sodium, Na', u'Zinc, Zn', u'Copper, Cu', u'Manganese, Mn', u'Selenium, Se', u'Vitamin A, IU', u'Retinol', u'Vitamin A, RAE', u'Vitamin C, total ascorbic acid', u'Thiamin', u'Riboflavin', u'Niacin', u'Pantothenic acid', u'Vitamin B-6', u'Folate, total', u'Vitamin B-12', u'Folic acid', u'Folate, food', u'Folate, DFE', u'Cholesterol', u'Fatty acids, total saturated', u'Fatty acids, total monounsaturated', u'Fatty acids, total polyunsaturated', u'Tryptophan', u'Threonine', u'Isoleucine', u'Leucine', u'Lysine', u'Methionine', u'Cystine', u'Phenylalanine', u'Tyrosine', u'Valine', u'Arginine', u'Histidine', u'Alanine', u'Aspartic acid', u'Glutamic acid', u'Glycine', u'Proline', u'Serine', u'Sucrose', u'Lactose', u'Maltose', u'Alcohol, ethyl', u'Caffeine', u'Theobromine', u'Sugars, total', u'Fluoride, F', u'Carotene, beta', u'Carotene, alpha', u'Vitamin E (alpha-tocopherol)', u'Vitamin D', u'Vitamin D3 (cholecalciferol)', u'Vitamin D (D2 + D3)', u'Cryptoxanthin, beta', u'Lycopene', u'Lutein + zeaxanthin', u'Tocopherol, gamma', u'Tocopherol, delta', u'Choline, total', u'Vitamin K (phylloquinone)', u'Betaine', u'Vitamin E, added', u'Vitamin B-12, added', u'Dihydrophylloquinone', u'Tocopherol, beta', u'Phytosterols', u'Glucose (dextrose)', u'Fructose', u'Galactose', u'Fatty acids, total trans', u'Stigmasterol', u'Campesterol', u'Beta-sitosterol', u'Fatty acids, total trans-monoenoic', u'Fatty acids, total trans-polyenoic', u'Hydroxyproline', u'Starch', u'Menaquinone-4', u'Vitamin D2 (ergocalciferol)', u'Adjusted Protein'], dtype=object)
# Re-derive the per-(nutrient, food group) max-value rows and select the
# Zinc slice of the resulting MultiIndex frame.
grouped = data.groupby(['nutrient', 'fgroup'])
results = grouped.apply(max_value)
# `.ix` was removed from pandas; `.loc` performs the same label lookup.
results.loc['Zinc, Zn']
id | food | fgroup | nutrient | ngroup | units | value | |
---|---|---|---|---|---|---|---|
fgroup | |||||||
Baby Foods | 3192 | Babyfood, cereal, oatmeal, with applesauce and... | Baby Foods | Zinc, Zn | Elements | mg | 7.5 |
Baked Products | 18374 | Leavening agents, yeast, baker's, compressed | Baked Products | Zinc, Zn | Elements | mg | 9.97 |
Beef Products | 13981 | Beef, chuck, short ribs, boneless, separable l... | Beef Products | Zinc, Zn | Elements | mg | 12.28 |
Beverages | 14557 | Chocolate-flavor beverage mix for milk, powder... | Beverages | Zinc, Zn | Elements | mg | 6.82 |
Breakfast Cereals | 8504 | Cereals ready-to-eat, Ralston Enriched Bran fl... | Breakfast Cereals | Zinc, Zn | Elements | mg | 64.33 |
Cereal Grains and Pasta | 20078 | Wheat germ, crude | Cereal Grains and Pasta | Zinc, Zn | Elements | mg | 12.29 |
Dairy and Egg Products | 43260 | Beverage, instant breakfast powder, chocolate,... | Dairy and Egg Products | Zinc, Zn | Elements | mg | 15.4 |
Ethnic Foods | 35194 | Agave, dried (Southwest) | Ethnic Foods | Zinc, Zn | Elements | mg | 12.1 |
Fast Foods | 21048 | Fast foods, oysters, battered or breaded, and ... | Fast Foods | Zinc, Zn | Elements | mg | 11.25 |
Fats and Oils | 2066 | Salad dressing, russian dressing, low calorie | Fats and Oils | Zinc, Zn | Elements | mg | 2.41 |
Finfish and Shellfish Products | 15170 | Mollusks, oyster, eastern, canned | Finfish and Shellfish Products | Zinc, Zn | Elements | mg | 90.95 |
Fruits and Fruit Juices | 9030 | Apricots, dehydrated (low-moisture), sulfured,... | Fruits and Fruit Juices | Zinc, Zn | Elements | mg | 1 |
Lamb, Veal, and Game Products | 17202 | Veal, variety meats and by-products, liver, raw | Lamb, Veal, and Game Products | Zinc, Zn | Elements | mg | 12.02 |
Legumes and Legume Products | 16176 | Soy protein isolate, PROTEIN TECHNOLOGIES INTE... | Legumes and Legume Products | Zinc, Zn | Elements | mg | 60 |
Meals, Entrees, and Sidedishes | 22959 | Macaroni and cheese dinner with dry sauce mix,... | Meals, Entrees, and Sidedishes | Zinc, Zn | Elements | mg | 1.42 |
Nut and Seed Products | 12011 | Seeds, cottonseed meal, partially defatted (gl... | Nut and Seed Products | Zinc, Zn | Elements | mg | 12.32 |
Pork Products | 10111 | Pork, fresh, variety meats and by-products, li... | Pork Products | Zinc, Zn | Elements | mg | 6.72 |
Poultry Products | 5180 | Turkey, all classes, neck, meat only, cooked, ... | Poultry Products | Zinc, Zn | Elements | mg | 7.12 |
Restaurant Foods | 36608 | CRACKER BARREL, grilled sirloin steak | Restaurant Foods | Zinc, Zn | Elements | mg | 5.67 |
Sausages and Luncheon Meats | 7912 | Roast beef spread | Sausages and Luncheon Meats | Zinc, Zn | Elements | mg | 6.08 |
Snacks | 25006 | Formulated bar, MARS SNACKFOOD US, SNICKERS Ma... | Snacks | Zinc, Zn | Elements | mg | 13.57 |
Soups, Sauces, and Gravies | 6048 | Soup, oyster stew, canned, condensed | Soups, Sauces, and Gravies | Zinc, Zn | Elements | mg | 8.4 |
Spices and Herbs | 2008 | Spices, chervil, dried | Spices and Herbs | Zinc, Zn | Elements | mg | 8.8 |
Sweets | 19078 | Baking chocolate, unsweetened, squares | Sweets | Zinc, Zn | Elements | mg | 9.63 |
Vegetables and Vegetable Products | 11268 | Mushrooms, shiitake, dried | Vegetables and Vegetable Products | Zinc, Zn | Elements | mg | 7.66 |
results.ix['Zinc, Zn'].value
fgroup Baby Foods 7.5 Baked Products 9.97 Beef Products 12.28 Beverages 6.82 Breakfast Cereals 64.33 Cereal Grains and Pasta 12.29 Dairy and Egg Products 15.4 Ethnic Foods 12.1 Fast Foods 11.25 Fats and Oils 2.41 Finfish and Shellfish Products 90.95 Fruits and Fruit Juices 1 Lamb, Veal, and Game Products 12.02 Legumes and Legume Products 60 Meals, Entrees, and Sidedishes 1.42 Nut and Seed Products 12.32 Pork Products 6.72 Poultry Products 7.12 Restaurant Foods 5.67 Sausages and Luncheon Meats 6.08 Snacks 13.57 Soups, Sauces, and Gravies 8.4 Spices and Herbs 8.8 Sweets 9.63 Vegetables and Vegetable Products 7.66 Name: value, dtype: object
results.ix['Zinc, Zn'].value.plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10100a190>
# Median nutrient value per (nutrient, food group), plotted for zinc.
plt.figure(figsize=(10, 8))
medians = data.groupby(['nutrient', 'fgroup'])['value'].median()
# `.ix` was removed from pandas; `.loc` performs the same label lookup.
medians.loc['Zinc, Zn'].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x11e151090>
# Max nutrient value per (nutrient, food group); caffeine maxima span
# orders of magnitude, so plot on a log scale.
plt.figure(figsize=(10, 8))
maxes = data.groupby(['nutrient', 'fgroup'])['value'].max()
# `.ix` was removed from pandas; `.loc` performs the same label lookup.
np.log(maxes.loc['Caffeine']).plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x106c3abd0>
data[['food', 'nutrient']].describe()
food | nutrient | |
---|---|---|
count | 389355 | 389355 |
unique | 6636 | 94 |
top | Milk, reduced fat, fluid, 2% milkfat, with add... | Energy |
freq | 270 | 13704 |