using DataFrames
using GLM
using Gadfly
# Read the CSV file at a hard-coded local path into `data`.
# NOTE(review): the absolute path is machine-specific; adjust it before running elsewhere.
data = readtable("/Users/ysekky/Downloads/data4a.csv")
Row | N | y | x | f |
---|---|---|---|---|
1 | 8 | 1 | 9.76 | C |
2 | 8 | 6 | 10.48 | C |
3 | 8 | 5 | 10.83 | C |
4 | 8 | 6 | 10.94 | C |
5 | 8 | 1 | 9.37 | C |
6 | 8 | 1 | 8.81 | C |
7 | 8 | 3 | 9.49 | C |
8 | 8 | 6 | 11.02 | C |
9 | 8 | 0 | 7.97 | C |
10 | 8 | 8 | 11.55 | C |
11 | 8 | 0 | 9.46 | C |
12 | 8 | 2 | 9.47 | C |
13 | 8 | 0 | 8.71 | C |
14 | 8 | 5 | 10.42 | C |
15 | 8 | 3 | 10.06 | C |
16 | 8 | 6 | 11.0 | C |
17 | 8 | 3 | 9.95 | C |
18 | 8 | 4 | 9.52 | C |
19 | 8 | 5 | 10.26 | C |
20 | 8 | 8 | 11.33 | C |
21 | 8 | 5 | 9.77 | C |
22 | 8 | 8 | 10.59 | C |
23 | 8 | 1 | 9.35 | C |
24 | 8 | 4 | 10.0 | C |
25 | 8 | 1 | 9.53 | C |
26 | 8 | 8 | 12.06 | C |
27 | 8 | 4 | 9.68 | C |
28 | 8 | 7 | 11.32 | C |
29 | 8 | 5 | 10.48 | C |
30 | 8 | 5 | 10.37 | C |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
# Convert column f to a PooledDataArray so that it can be handled as a categorical variable.
data[:f] = convert(PooledDataArray,data[:f])
# Show per-column summary statistics for the data frame.
describe(data)
N Min 8.0 1st Qu. 8.0 Median 8.0 Mean 8.0 3rd Qu. 8.0 Max 8.0 NAs 0 NA% 0.0% y Min 0.0 1st Qu. 3.0 Median 6.0 Mean 5.08 3rd Qu. 8.0 Max 8.0 NAs 0 NA% 0.0% x Min 7.66 1st Qu. 9.3375 Median 9.965 Mean 9.967199999999998 3rd Qu. 10.77 Max 12.44 NAs 0 NA% 0.0% f Length 100 Type UTF8String NAs 0 NA% 0.0% Unique 2
# Plot y against x, with the color taken from column f.
plot(data, x="x", y="y", color="f")
# Logistic function: returns 1 / (1 + exp(-z)) for a scalar z.
function logistic(z)
    denom = 1 + exp(-z)
    return 1 / denom
end
# Plot the logistic function over the range -6 to 6.
plot(logistic,-6, 6)
N Min 8.0 1st Qu. 8.0 Median 8.0 Mean 8.0 3rd Qu. 8.0 Max 8.0 NAs 0 NA% 0.0% y Min 0.0 1st Qu. 3.0 Median 6.0 Mean 5.08 3rd Qu. 8.0 Max 8.0 NAs 0 NA% 0.0% x Min 7.66 1st Qu. 9.3375 Median 9.965 Mean 9.967199999999998 3rd Qu. 10.77 Max 12.44 NAs 0 NA% 0.0% f Length 100 Type Pooled UTF8String NAs 0 NA% 0.0% Unique 2 p Min 0.0 1st Qu. 0.375 Median 0.75 Mean 0.635 3rd Qu. 1.0 Max 1.0 NAs 0 NA% 0.0%
# As also noted in 6.5, this is the form we would ideally like to work:
# a response built from the two columns (y, N - y).
# hcat produces a two-column matrix, and assigning it to a DataFrame column raises the
# ArgumentError recorded in the output below, so the glm call is not reached.
data[:t] = hcat(data[:y], data[:N] - data[:y])
glm(t~x+f , data, Binomial(), LogitLink())
ArgumentError("setindex!(::DataFrame, ...) only broadcasts scalars, not arrays") while loading In[100], in expression starting on line 1 in setindex! at /Users/ysekky/.julia/DataFrames/src/dataframe/dataframe.jl:359
# This also gives the same result, but presumably only because N is identical
# for every row.
# p is the observed proportion y / N per row. Dividing by the N column instead of the
# literal 8.0 gives the same values for this data set and stays correct if N ever varies.
# NOTE(review): no prior weights are passed to glm, so each row is treated as a single
# observation; consider supplying N as weights (GLM's `wts` keyword) so the standard
# errors reflect the number of trials. Confirm against the installed GLM version.
data[:p] = data[:y] ./ data[:N]
glm(p~x+f , data, Binomial(), LogitLink())
DataFrameRegressionModel{GeneralizedLinearModel,Float64}: Coefficients: Estimate Std.Error z value Pr(>|z|) (Intercept) -19.5361 3.99861 -4.88572 <1e-5 x 1.95241 0.392777 4.97077 <1e-6 f - T 2.02151 0.654152 3.09027 0.0020
# It seems that a linear predictor combining a categorical variable with a real-valued
# variable (the x*f term) cannot be used.
# NOTE(review): the error recorded below is "key not found: :p", i.e. column p was
# missing when this cell ran; re-run after defining data[:p] to confirm whether the
# interaction term itself is the problem.
glm(p~x+f+x*f , data, Binomial(), LogitLink())
key not found: :p while loading In[105], in expression starting on line 1 in getindex at /Users/ysekky/.julia/DataFrames/src/other/index.jl:105 in getindex at /Users/ysekky/.julia/DataFrames/src/dataframe/dataframe.jl:232 in anonymous at /Users/ysekky/.julia/DataFrames/src/statsmodels/formula.jl:186 in map at ./base.jl:184 in ModelFrame at /Users/ysekky/.julia/DataFrames/src/statsmodels/formula.jl:186 in fit at /Users/ysekky/.julia/DataFrames/src/statsmodels/statsmodel.jl:52 in glm at /Users/ysekky/.julia/GLM/src/glmfit.jl:170
# This does not work either: multiplying the numeric column x by the string column f
# has no matching `*` method (see the error recorded below).
data[:x] * data[:f]
`*` has no method matching *(::Array{Float64,1}, ::Array{UTF8String,1}) while loading In[108], in expression starting on line 2 in * at /Users/ysekky/.julia/DataArrays/src/operators.jl:416