# Python でやってみる『"データサイエンティスト養成読本" 特集1 第1章 Rで統計解析を始めよう』
# list1
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# list2: load the sample data set
## Sample data is distributed on the book's support page:
## http://gihyo.jp/book/2013/978-4-7741-5896-9/support
body_data = pd.read_csv("body_sample.csv")
body_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 400 entries, 0 to 399 Data columns (total 4 columns): id 400 non-null int64 gender 400 non-null object height 400 non-null float64 weight 400 non-null float64 dtypes: float64(2), int64(1), object(1)
# list3: basic data operations
# Select a column by position. Plain body_data[[1]] is a *label* lookup, and
# since the columns are named with strings it raises KeyError on current
# pandas, so positional access goes through .iloc.
body_data.iloc[:, [1]]
gender | |
---|---|
0 | M |
1 | M |
2 | M |
3 | M |
4 | M |
5 | M |
6 | M |
7 | M |
8 | M |
9 | M |
10 | M |
11 | M |
12 | M |
13 | M |
14 | M |
15 | M |
16 | M |
17 | M |
18 | M |
19 | M |
20 | M |
21 | M |
22 | M |
23 | M |
24 | M |
25 | M |
26 | M |
27 | M |
28 | M |
29 | M |
... | ... |
370 | F |
371 | F |
372 | F |
373 | F |
374 | F |
375 | F |
376 | F |
377 | F |
378 | F |
379 | F |
380 | F |
381 | F |
382 | F |
383 | F |
384 | F |
385 | F |
386 | F |
387 | F |
388 | F |
389 | F |
390 | F |
391 | F |
392 | F |
393 | F |
394 | F |
395 | F |
396 | F |
397 | F |
398 | F |
399 | F |
400 rows × 1 columns
# Select several columns by position; .iloc is required because [[0, 2]]
# would be interpreted as column labels, not positions.
body_data.iloc[:, [0, 2]]
id | height | |
---|---|---|
0 | 1 | 157.67 |
1 | 2 | 178.76 |
2 | 3 | 161.95 |
3 | 4 | 162.26 |
4 | 5 | 167.95 |
5 | 6 | 165.59 |
6 | 7 | 163.66 |
7 | 8 | 171.78 |
8 | 9 | 161.11 |
9 | 10 | 160.97 |
10 | 11 | 163.69 |
11 | 12 | 152.74 |
12 | 13 | 157.58 |
13 | 14 | 167.95 |
14 | 15 | 170.35 |
15 | 16 | 171.33 |
16 | 17 | 166.38 |
17 | 18 | 167.69 |
18 | 19 | 170.19 |
19 | 20 | 172.97 |
20 | 21 | 157.60 |
21 | 22 | 165.60 |
22 | 23 | 161.91 |
23 | 24 | 153.29 |
24 | 25 | 162.09 |
25 | 26 | 168.42 |
26 | 27 | 163.93 |
27 | 28 | 157.76 |
28 | 29 | 154.93 |
29 | 30 | 155.84 |
... | ... | ... |
370 | 371 | 154.89 |
371 | 372 | 155.31 |
372 | 373 | 160.31 |
373 | 374 | 151.83 |
374 | 375 | 151.46 |
375 | 376 | 154.98 |
376 | 377 | 150.91 |
377 | 378 | 153.04 |
378 | 379 | 154.76 |
379 | 380 | 154.67 |
380 | 381 | 158.71 |
381 | 382 | 149.25 |
382 | 383 | 167.48 |
383 | 384 | 159.67 |
384 | 385 | 151.79 |
385 | 386 | 148.39 |
386 | 387 | 158.86 |
387 | 388 | 160.72 |
388 | 389 | 151.20 |
389 | 390 | 151.27 |
390 | 391 | 141.49 |
391 | 392 | 167.83 |
392 | 393 | 156.41 |
393 | 394 | 145.48 |
394 | 395 | 160.13 |
395 | 396 | 163.84 |
396 | 397 | 161.23 |
397 | 398 | 161.86 |
398 | 399 | 150.88 |
399 | 400 | 146.28 |
400 rows × 2 columns
body_data["weight"] # select a single column by name (bracket syntax)
0 64.82 1 72.38 2 64.52 3 63.35 4 68.76 5 66.40 6 64.22 7 67.76 8 60.76 9 60.64 10 64.36 11 58.86 12 60.88 13 66.76 14 71.70 ... 385 47.84 386 53.94 387 60.88 388 47.62 389 44.98 390 41.07 391 67.78 392 52.70 393 45.13 394 57.55 395 59.15 396 56.51 397 57.02 398 50.60 399 42.26 Name: weight, Length: 400, dtype: float64
body_data.weight # select a single column via attribute access (.column_name)
0 64.82 1 72.38 2 64.52 3 63.35 4 68.76 5 66.40 6 64.22 7 67.76 8 60.76 9 60.64 10 64.36 11 58.86 12 60.88 13 66.76 14 71.70 ... 385 47.84 386 53.94 387 60.88 388 47.62 389 44.98 390 41.07 391 67.78 392 52.70 393 45.13 394 57.55 395 59.15 396 56.51 397 57.02 398 50.60 399 42.26 Name: weight, Length: 400, dtype: float64
body_data[["id", "height"]] # select several columns by name
id | height | |
---|---|---|
0 | 1 | 157.67 |
1 | 2 | 178.76 |
2 | 3 | 161.95 |
3 | 4 | 162.26 |
4 | 5 | 167.95 |
5 | 6 | 165.59 |
6 | 7 | 163.66 |
7 | 8 | 171.78 |
8 | 9 | 161.11 |
9 | 10 | 160.97 |
10 | 11 | 163.69 |
11 | 12 | 152.74 |
12 | 13 | 157.58 |
13 | 14 | 167.95 |
14 | 15 | 170.35 |
15 | 16 | 171.33 |
16 | 17 | 166.38 |
17 | 18 | 167.69 |
18 | 19 | 170.19 |
19 | 20 | 172.97 |
20 | 21 | 157.60 |
21 | 22 | 165.60 |
22 | 23 | 161.91 |
23 | 24 | 153.29 |
24 | 25 | 162.09 |
25 | 26 | 168.42 |
26 | 27 | 163.93 |
27 | 28 | 157.76 |
28 | 29 | 154.93 |
29 | 30 | 155.84 |
... | ... | ... |
370 | 371 | 154.89 |
371 | 372 | 155.31 |
372 | 373 | 160.31 |
373 | 374 | 151.83 |
374 | 375 | 151.46 |
375 | 376 | 154.98 |
376 | 377 | 150.91 |
377 | 378 | 153.04 |
378 | 379 | 154.76 |
379 | 380 | 154.67 |
380 | 381 | 158.71 |
381 | 382 | 149.25 |
382 | 383 | 167.48 |
383 | 384 | 159.67 |
384 | 385 | 151.79 |
385 | 386 | 148.39 |
386 | 387 | 158.86 |
387 | 388 | 160.72 |
388 | 389 | 151.20 |
389 | 390 | 151.27 |
390 | 391 | 141.49 |
391 | 392 | 167.83 |
392 | 393 | 156.41 |
393 | 394 | 145.48 |
394 | 395 | 160.13 |
395 | 396 | 163.84 |
396 | 397 | 161.23 |
397 | 398 | 161.86 |
398 | 399 | 150.88 |
399 | 400 | 146.28 |
400 rows × 2 columns
body_data[body_data.gender == "F"] # keep only the rows that satisfy the condition
id | gender | height | weight | |
---|---|---|---|---|
200 | 201 | F | 157.64 | 51.16 |
201 | 202 | F | 146.67 | 43.98 |
202 | 203 | F | 154.72 | 56.15 |
203 | 204 | F | 158.33 | 53.37 |
204 | 205 | F | 153.25 | 49.22 |
205 | 206 | F | 165.35 | 62.92 |
206 | 207 | F | 147.87 | 47.59 |
207 | 208 | F | 150.70 | 43.52 |
208 | 209 | F | 146.92 | 43.73 |
209 | 210 | F | 137.89 | 40.85 |
210 | 211 | F | 150.98 | 49.94 |
211 | 212 | F | 153.57 | 53.25 |
212 | 213 | F | 152.95 | 48.96 |
213 | 214 | F | 157.06 | 49.27 |
214 | 215 | F | 153.07 | 51.64 |
215 | 216 | F | 160.87 | 57.23 |
216 | 217 | F | 149.80 | 51.29 |
217 | 218 | F | 161.15 | 55.11 |
218 | 219 | F | 165.63 | 65.16 |
219 | 220 | F | 147.34 | 44.93 |
220 | 221 | F | 159.46 | 59.56 |
221 | 222 | F | 143.69 | 45.75 |
222 | 223 | F | 155.91 | 53.43 |
223 | 224 | F | 161.78 | 57.97 |
224 | 225 | F | 151.21 | 50.32 |
225 | 226 | F | 165.19 | 63.59 |
226 | 227 | F | 149.45 | 48.54 |
227 | 228 | F | 143.75 | 43.67 |
228 | 229 | F | 160.09 | 60.29 |
229 | 230 | F | 149.42 | 50.35 |
... | ... | ... | ... | ... |
370 | 371 | F | 154.89 | 53.77 |
371 | 372 | F | 155.31 | 49.11 |
372 | 373 | F | 160.31 | 55.68 |
373 | 374 | F | 151.83 | 50.36 |
374 | 375 | F | 151.46 | 51.21 |
375 | 376 | F | 154.98 | 55.09 |
376 | 377 | F | 150.91 | 49.67 |
377 | 378 | F | 153.04 | 56.13 |
378 | 379 | F | 154.76 | 50.25 |
379 | 380 | F | 154.67 | 52.39 |
380 | 381 | F | 158.71 | 57.63 |
381 | 382 | F | 149.25 | 50.68 |
382 | 383 | F | 167.48 | 65.62 |
383 | 384 | F | 159.67 | 58.95 |
384 | 385 | F | 151.79 | 45.20 |
385 | 386 | F | 148.39 | 47.84 |
386 | 387 | F | 158.86 | 53.94 |
387 | 388 | F | 160.72 | 60.88 |
388 | 389 | F | 151.20 | 47.62 |
389 | 390 | F | 151.27 | 44.98 |
390 | 391 | F | 141.49 | 41.07 |
391 | 392 | F | 167.83 | 67.78 |
392 | 393 | F | 156.41 | 52.70 |
393 | 394 | F | 145.48 | 45.13 |
394 | 395 | F | 160.13 | 57.55 |
395 | 396 | F | 163.84 | 59.15 |
396 | 397 | F | 161.23 | 56.51 |
397 | 398 | F | 161.86 | 57.02 |
398 | 399 | F | 150.88 | 50.60 |
399 | 400 | F | 146.28 | 42.26 |
200 rows × 4 columns
# Sort by height in ascending order. DataFrame.sort() no longer exists in
# pandas; sort_values() is its replacement.
body_data.sort_values("height")
id | gender | height | weight | |
---|---|---|---|---|
323 | 324 | F | 135.51 | 33.07 |
269 | 270 | F | 136.59 | 36.07 |
282 | 283 | F | 136.85 | 31.44 |
209 | 210 | F | 137.89 | 40.85 |
281 | 282 | F | 140.53 | 41.28 |
246 | 247 | F | 141.10 | 35.02 |
390 | 391 | F | 141.49 | 41.07 |
324 | 325 | F | 141.62 | 39.77 |
242 | 243 | F | 142.09 | 43.59 |
257 | 258 | F | 143.00 | 38.47 |
251 | 252 | F | 143.17 | 39.01 |
266 | 267 | F | 143.25 | 45.40 |
221 | 222 | F | 143.69 | 45.75 |
227 | 228 | F | 143.75 | 43.67 |
364 | 365 | F | 143.79 | 43.30 |
286 | 287 | F | 144.07 | 42.68 |
244 | 245 | F | 144.47 | 44.17 |
237 | 238 | F | 145.02 | 40.22 |
90 | 91 | M | 145.10 | 50.49 |
260 | 261 | F | 145.48 | 47.71 |
393 | 394 | F | 145.48 | 45.13 |
142 | 143 | M | 145.63 | 55.43 |
247 | 248 | F | 145.80 | 45.37 |
119 | 120 | M | 145.97 | 55.62 |
53 | 54 | M | 146.00 | 50.69 |
399 | 400 | F | 146.28 | 42.26 |
188 | 189 | M | 146.48 | 56.07 |
201 | 202 | F | 146.67 | 43.98 |
208 | 209 | F | 146.92 | 43.73 |
283 | 284 | F | 147.27 | 41.88 |
... | ... | ... | ... | ... |
93 | 94 | M | 171.05 | 70.21 |
113 | 114 | M | 171.11 | 77.74 |
59 | 60 | M | 171.29 | 70.13 |
15 | 16 | M | 171.33 | 76.02 |
126 | 127 | M | 171.40 | 68.27 |
7 | 8 | M | 171.78 | 67.76 |
60 | 61 | M | 171.81 | 68.40 |
70 | 71 | M | 171.86 | 77.23 |
128 | 129 | M | 171.86 | 76.90 |
161 | 162 | M | 171.95 | 69.59 |
97 | 98 | M | 172.19 | 68.83 |
197 | 198 | M | 172.24 | 69.80 |
133 | 134 | M | 172.33 | 66.38 |
82 | 83 | M | 172.43 | 69.58 |
134 | 135 | M | 172.57 | 77.34 |
19 | 20 | M | 172.97 | 70.98 |
328 | 329 | F | 173.76 | 66.17 |
187 | 188 | M | 173.79 | 75.48 |
167 | 168 | M | 174.07 | 72.92 |
109 | 110 | M | 174.10 | 67.76 |
190 | 191 | M | 175.74 | 75.60 |
183 | 184 | M | 176.07 | 74.48 |
38 | 39 | M | 176.24 | 73.56 |
44 | 45 | M | 176.34 | 75.02 |
154 | 155 | M | 177.06 | 74.68 |
116 | 117 | M | 177.17 | 78.99 |
1 | 2 | M | 178.76 | 72.38 |
112 | 113 | M | 178.80 | 69.24 |
124 | 125 | M | 179.17 | 78.56 |
76 | 77 | M | 181.53 | 77.60 |
400 rows × 4 columns
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html
# Sort by height in descending order (sort_values replaces the removed DataFrame.sort).
body_data.sort_values("height", ascending=False)
id | gender | height | weight | |
---|---|---|---|---|
76 | 77 | M | 181.53 | 77.60 |
124 | 125 | M | 179.17 | 78.56 |
112 | 113 | M | 178.80 | 69.24 |
1 | 2 | M | 178.76 | 72.38 |
116 | 117 | M | 177.17 | 78.99 |
154 | 155 | M | 177.06 | 74.68 |
44 | 45 | M | 176.34 | 75.02 |
38 | 39 | M | 176.24 | 73.56 |
183 | 184 | M | 176.07 | 74.48 |
190 | 191 | M | 175.74 | 75.60 |
109 | 110 | M | 174.10 | 67.76 |
167 | 168 | M | 174.07 | 72.92 |
187 | 188 | M | 173.79 | 75.48 |
328 | 329 | F | 173.76 | 66.17 |
19 | 20 | M | 172.97 | 70.98 |
134 | 135 | M | 172.57 | 77.34 |
82 | 83 | M | 172.43 | 69.58 |
133 | 134 | M | 172.33 | 66.38 |
197 | 198 | M | 172.24 | 69.80 |
97 | 98 | M | 172.19 | 68.83 |
161 | 162 | M | 171.95 | 69.59 |
70 | 71 | M | 171.86 | 77.23 |
128 | 129 | M | 171.86 | 76.90 |
60 | 61 | M | 171.81 | 68.40 |
7 | 8 | M | 171.78 | 67.76 |
126 | 127 | M | 171.40 | 68.27 |
15 | 16 | M | 171.33 | 76.02 |
59 | 60 | M | 171.29 | 70.13 |
113 | 114 | M | 171.11 | 77.74 |
93 | 94 | M | 171.05 | 70.21 |
... | ... | ... | ... | ... |
283 | 284 | F | 147.27 | 41.88 |
208 | 209 | F | 146.92 | 43.73 |
201 | 202 | F | 146.67 | 43.98 |
188 | 189 | M | 146.48 | 56.07 |
399 | 400 | F | 146.28 | 42.26 |
53 | 54 | M | 146.00 | 50.69 |
119 | 120 | M | 145.97 | 55.62 |
247 | 248 | F | 145.80 | 45.37 |
142 | 143 | M | 145.63 | 55.43 |
260 | 261 | F | 145.48 | 47.71 |
393 | 394 | F | 145.48 | 45.13 |
90 | 91 | M | 145.10 | 50.49 |
237 | 238 | F | 145.02 | 40.22 |
244 | 245 | F | 144.47 | 44.17 |
286 | 287 | F | 144.07 | 42.68 |
364 | 365 | F | 143.79 | 43.30 |
227 | 228 | F | 143.75 | 43.67 |
221 | 222 | F | 143.69 | 45.75 |
266 | 267 | F | 143.25 | 45.40 |
251 | 252 | F | 143.17 | 39.01 |
257 | 258 | F | 143.00 | 38.47 |
242 | 243 | F | 142.09 | 43.59 |
324 | 325 | F | 141.62 | 39.77 |
390 | 391 | F | 141.49 | 41.07 |
246 | 247 | F | 141.10 | 35.02 |
281 | 282 | F | 140.53 | 41.28 |
209 | 210 | F | 137.89 | 40.85 |
282 | 283 | F | 136.85 | 31.44 |
269 | 270 | F | 136.59 | 36.07 |
323 | 324 | F | 135.51 | 33.07 |
400 rows × 4 columns
# list4: numeric summary (the counterpart of R's summary)
body_data.describe()
id | height | weight | |
---|---|---|---|
count | 400.000000 | 400.000000 | 400.000000 |
mean | 200.500000 | 158.368625 | 58.161525 |
std | 115.614301 | 8.210723 | 9.168719 |
min | 1.000000 | 135.510000 | 31.440000 |
25% | 100.750000 | 152.397500 | 50.930000 |
50% | 200.500000 | 158.240000 | 57.785000 |
75% | 300.250000 | 163.862500 | 65.525000 |
max | 400.000000 | 181.530000 | 78.990000 |
# list5: dispersion statistics
body_data.height.std() # standard deviation
8.2107232507360379
body_data.weight.var() # unbiased variance
84.065404936723198
# list6: histogram of height (30 bins)
plt.xlabel("height")
plt.ylabel("count")
plt.hist(body_data["height"], bins=30)
plt.show()
# list7: histogram of height, colored by gender
body_data_f = body_data[body_data.gender == "F"]
body_data_m = body_data[body_data.gender == "M"]
# Draw both groups in a single stacked hist() call so they share the same bin
# edges. Overlaying two independent hist() calls computes different bins for
# each data set, and the previous version labelled the all-rows histogram "M".
plt.hist(
    [body_data_f.height.values, body_data_m.height.values],
    bins=30,
    stacked=True,
    color=["magenta", "cyan"],
    label=["F", "M"],
)
plt.xlabel("height")
plt.ylabel("count")
# Labels come from the hist() call above, so the legend cannot drift out of
# sync with the plotted groups.
plt.legend(title="gender")
plt.show()
# sample: the same histogram with a few more options (semi-transparent layers)
# http://matplotlib.org/api/pyplot_api.html?highlight=hist#matplotlib.pyplot.hist
body_data_m = body_data[body_data.gender == "M"]
body_data_f = body_data[body_data.gender == "F"]
# (rows, fill color, opacity) for each layer, in drawing order: all, M, F.
hist_layers = [
    (body_data, "yellow", .2),
    (body_data_m, "cyan", .5),
    (body_data_f, "magenta", .5),
]
for layer_rows, layer_color, layer_alpha in hist_layers:
    plt.hist(layer_rows.height.values, bins=30, color=layer_color, alpha=layer_alpha)
plt.xlabel("height")
plt.ylabel("count")
plt.legend(["all", "M", "F"], title="gender")
plt.show()
# list8: box plot of height, one box per gender
body_data_f = body_data[body_data.gender == "F"]
body_data_m = body_data[body_data.gender == "M"]
plt.boxplot([body_data_f["height"], body_data_m["height"]])
# Box positions are 1-based: box 1 is female, box 2 is male.
plt.xticks([1, 2], ['F', 'M'])
plt.xlabel("gender")
plt.show()
# list9: scatter plot of height against weight
plt.xlabel("height")
plt.ylabel("weight")
plt.scatter(body_data["height"], body_data["weight"], s=1)
plt.show()
# list10: scatter plot of height against weight, plus the regression line
from sklearn import linear_model
LinerRegr = linear_model.LinearRegression()
x = body_data[["height"]]
y = body_data[["weight"]]
# Fit on the underlying arrays: predict() below receives a plain ndarray, and
# mixing a DataFrame fit with an ndarray predict triggers scikit-learn's
# feature-name mismatch warning.
LinerRegr.fit(x.values, y.values)
plt.scatter(x, y, s=1)
# DataFrame.min()/max() return one-element Series, which np.arange can only
# use through a deprecated implicit scalar conversion; take the scalar
# bounds from the column itself.
px = np.arange(x["height"].min(), x["height"].max(), .01)[:, np.newaxis]
py = LinerRegr.predict(px)
plt.plot(px, py, color="blue", linewidth=2)
plt.xlabel("height")
plt.ylabel("weight")
plt.show()
# list11: scatter plot of height against weight with one regression line per gender
from sklearn import linear_model
x = body_data[["height"]]
y = body_data[["weight"]]
body_data_m = body_data[body_data.gender == "M"]
body_data_f = body_data[body_data.gender == "F"]
# All subjects go into a single scatter; the per-gender lines are added afterwards.
plt.scatter(x, y, s=1)
def plot_linear_regression(data, x_variable="height", y_variable="weight", color="black", linewidth=2):
    """Fit a linear regression of one column on another and draw the fitted line.

    The line is drawn on the current matplotlib axes over the observed range
    of ``x_variable`` (sampled every 0.01 units).

    Args:
        data: DataFrame containing both columns.
        x_variable: Name of the explanatory column.
        y_variable: Name of the response column.
        color: Line color passed to ``plt.plot``.
        linewidth: Line width passed to ``plt.plot``.
    """
    # Work on plain 2-D arrays: the model is fit and queried with the same
    # kind of input, and min()/max() below yield scalars instead of
    # one-element Series (which np.arange cannot take without a deprecated
    # implicit conversion).
    x = data[[x_variable]].values
    y = data[[y_variable]].values
    liner_regr = linear_model.LinearRegression()
    liner_regr.fit(x, y)
    px = np.arange(x.min(), x.max(), .01)[:, np.newaxis]
    py = liner_regr.predict(px)
    plt.plot(px, py, color=color, linewidth=linewidth)
plot_linear_regression(data=body_data_m, color="cyan")
plot_linear_regression(data=body_data_f, color="pink")
# Hand the two regression lines to legend() explicitly. With labels only,
# matplotlib pairs labels with artists by order, so the scatter drawn earlier
# could receive "M" and the male line "F". The axes' line list holds just the
# two fitted lines, in the order they were drawn (M, then F).
regression_lines = list(plt.gca().lines)
plt.legend(regression_lines, ["M", "F"], title="gender", loc='lower right')
plt.xlabel("height")
plt.ylabel("weight")
plt.show()
# list12: correlation coefficients between height and weight
## all subjects
corr_of_all = np.corrcoef(body_data["height"], body_data["weight"])
# corrcoef returns a 2x2 matrix; the off-diagonal entry is the height-weight correlation.
corr_of_all[0, 1]
0.8928747678201534
## male subjects only
body_data_m = body_data[body_data.gender == "M"]
corr_of_m = np.corrcoef(body_data_m["height"], body_data_m["weight"])
# Off-diagonal entry of the 2x2 correlation matrix.
corr_of_m[0, 1]
0.86345697571511137
## female subjects only
body_data_f = body_data[body_data.gender == "F"]
corr_of_f = np.corrcoef(body_data_f["height"], body_data_f["weight"])
# Off-diagonal entry of the 2x2 correlation matrix.
corr_of_f[0, 1]
0.91735993883788636