#  Python でやってみる『"データサイエンティスト養成読本" 特集1 第1章 Rで統計解析を始めよう』

# list1
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# list2
## Sample data
## http://gihyo.jp/book/2013/978-4-7741-5896-9/support
body_data = pd.read_csv("body_sample.csv")
body_data.info()

# list3 Basic data operations

# NOTE: indexing a DataFrame with a list (e.g. body_data[[1]]) looks the items
# up as column *labels*, so positional selection has to go through .iloc.
body_data.iloc[:, [1]]  # select a column by position

body_data.iloc[:, [0, 2]]  # select several columns by position

body_data["weight"]  # select a column by name

body_data.weight  # select a column via attribute access

body_data[["id", "height"]]  # select several columns by name

body_data[body_data.gender == "F"]  # keep only the rows matching a condition

# DataFrame.sort() has been removed from pandas; sort_values() replaces it.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html
body_data.sort_values("height")  # sort ascending

body_data.sort_values("height", ascending=False)  # sort descending

# list4 Numeric summary (counterpart of R's summary)
body_data.describe()

# list5

body_data["height"].std()  # standard deviation

body_data["weight"].var()  # unbiased variance

# list6 Histogram of height
plt.hist(body_data["height"], bins=30)
plt.ylabel("count")
plt.xlabel("height")
plt.show()

# list7 Histogram coloured by gender
body_data_f = body_data[body_data.gender == "F"]
body_data_m = body_data[body_data.gender == "M"]
# Draw one stacked histogram so both groups share the same bin edges and each
# legend entry describes exactly the bars it is attached to.
# NOTE(review): assumes gender only takes the values "F" and "M" - confirm.
plt.hist(
    [body_data_f.height.values, body_data_m.height.values],
    bins=30,
    stacked=True,
    color=["magenta", "cyan"],
    label=["F", "M"],
)
plt.xlabel("height")
plt.ylabel("count")
plt.legend(title="gender")
plt.show()

# sample: play with a few more hist options
# http://matplotlib.org/api/pyplot_api.html?highlight=hist#matplotlib.pyplot.hist
body_data_f = body_data[body_data["gender"] == "F"]
body_data_m = body_data[body_data["gender"] == "M"]
# Drawing order matters: plt.legend() below hands out its labels in the order
# the histograms were drawn.
for _subset, _color, _alpha in (
    (body_data, "yellow", .2),
    (body_data_m, "cyan", .5),
    (body_data_f, "magenta", .5),
):
    plt.hist(_subset["height"].values, bins=30, color=_color, alpha=_alpha)
plt.ylabel("count")
plt.xlabel("height")
plt.legend(["all", "M", "F"], title="gender")
plt.show()

# list8 Box plot of the height data
body_data_m = body_data[body_data["gender"] == "M"]
body_data_f = body_data[body_data["gender"] == "F"]
# Boxes are drawn at positions 1 and 2 in list order, matching the tick labels below.
plt.boxplot([body_data_f["height"], body_data_m["height"]])
plt.xticks([1, 2], ['F', 'M'])
plt.xlabel("gender")
plt.show()

# list9 Scatter plot of height against weight
plt.scatter(body_data["height"], body_data["weight"], s=1)
plt.ylabel("weight")
plt.xlabel("height")
plt.show()

# list10 Scatter plot of height against weight, plus the regression line
from sklearn import linear_model
LinerRegr = linear_model.LinearRegression()
x = body_data[["height"]]
y = body_data[["weight"]]
# Fit on the underlying arrays: predict() below receives a plain ndarray, so
# the model should be fitted with the same kind of input.
LinerRegr.fit(x.values, y.values)
plt.scatter(x, y, s=1)
# x is a one-column DataFrame, so x.min() / x.max() would be Series;
# np.arange needs scalar bounds, hence the reduction over .values.
px = np.arange(x.values.min(), x.values.max(), .01)[:, np.newaxis]
py = LinerRegr.predict(px)
plt.plot(px, py, color="blue", linewidth=2)
plt.xlabel("height")
plt.ylabel("weight")
plt.show()

# list11 Scatter plot of height against weight, plus one regression line per gender
from sklearn import linear_model
body_data_m = body_data[body_data["gender"] == "M"]
body_data_f = body_data[body_data["gender"] == "F"]

# Scatter of every subject; the per-gender lines are drawn on top afterwards.
y = body_data[["weight"]]
x = body_data[["height"]]
plt.scatter(x, y, s=1)

def plot_linear_regression(data, x_variable="height", y_variable="weight", color="black", linewidth=2):
    """Fit a linear regression on two columns of *data* and draw the fitted line.

    The line is drawn with ``plt.plot`` on the current matplotlib axes and
    spans the observed range of the explanatory column. Nothing is returned.

    Args:
        data: DataFrame containing both columns.
        x_variable: Name of the explanatory column.
        y_variable: Name of the response column.
        color: Line colour forwarded to ``plt.plot``.
        linewidth: Line width forwarded to ``plt.plot``.
    """
    # Work on (n, 1) ndarrays: fit() and predict() then see the same kind of
    # input, and min()/max() below reduce to scalars that np.arange accepts.
    x = data[[x_variable]].values
    y = data[[y_variable]].values
    liner_regr = linear_model.LinearRegression()
    liner_regr.fit(x, y)
    px = np.arange(x.min(), x.max(), .01)[:, np.newaxis]
    py = liner_regr.predict(px)
    plt.plot(px, py, color=color, linewidth=linewidth)

plot_linear_regression(data=body_data_m, color="cyan")
plot_linear_regression(data=body_data_f, color="pink")

# Pass the regression lines as explicit handles. With labels alone, current
# matplotlib pairs them with artists in drawing order, so the scatter drawn
# before the lines would take "M" and the male line would take "F".
plt.legend(plt.gca().get_lines(), ["M", "F"], title="gender", loc='lower right')
plt.xlabel("height")
plt.ylabel("weight")
plt.show()

# list12 Computing correlation coefficients
# np.corrcoef returns the 2x2 correlation matrix; element [0, 1] is the
# height/weight coefficient.

## All subjects
corr_of_all = np.corrcoef(body_data["height"], body_data["weight"])
corr_of_all[0, 1]

## Male
body_data_m = body_data[body_data["gender"] == "M"]
corr_of_m = np.corrcoef(body_data_m["height"], body_data_m["weight"])
corr_of_m[0, 1]

## Female
body_data_f = body_data[body_data["gender"] == "F"]
corr_of_f = np.corrcoef(body_data_f["height"], body_data_f["weight"])
corr_of_f[0, 1]