In [30]:
using DataFrames
using GLM
using Gadfly
In [102]:
data = readtable("/Users/ysekky/Downloads/data4a.csv")
Out[102]:
Nyxf
1819.76C
28610.48C
38510.83C
48610.94C
5819.37C
6818.81C
7839.49C
88611.02C
9807.97C
108811.55C
11809.46C
12829.47C
13808.71C
148510.42C
158310.06C
168611.0C
17839.95C
18849.52C
198510.26C
208811.33C
21859.77C
228810.59C
23819.35C
248410.0C
25819.53C
268812.06C
27849.68C
288711.32C
298510.48C
308510.37C
In [103]:
#Fをカテゴリカル変数が扱えるようにPooledDataArrayに変換する
data[:f] = convert(PooledDataArray,data[:f])
describe(data)
N
Min      8.0
1st Qu.  8.0
Median   8.0
Mean     8.0
3rd Qu.  8.0
Max      8.0
NAs      0
NA%      0.0%

y
Min      0.0
1st Qu.  3.0
Median   6.0
Mean     5.08
3rd Qu.  8.0
Max      8.0
NAs      0
NA%      0.0%

x
Min      7.66
1st Qu.  9.3375
Median   9.965
Mean     9.967199999999998
3rd Qu.  10.77
Max      12.44
NAs      0
NA%      0.0%

f
Length  100
Type    UTF8String
NAs     0
NA%     0.0%
Unique  2

In [104]:
plot(data, x="x", y="y", color="f")
Out[104]:
x 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8 2.0 2.2 2.4 2.6 2.8 3.0 3.2 3.4 3.6 3.8 4.0 4.2 4.4 4.6 4.8 5.0 5.2 5.4 5.6 5.8 6.0 6.2 6.4 6.6 6.8 7.0 7.2 7.4 7.6 7.8 8.0 8.2 8.4 8.6 8.8 9.0 9.2 9.4 9.6 9.8 10.0 10.2 10.4 10.6 10.8 11.0 11.2 11.4 11.6 11.8 12.0 12.2 12.4 12.6 12.8 13.0 13.2 13.4 13.6 13.8 14.0 14.2 14.4 14.6 14.8 15.0 15.2 15.4 15.6 15.8 16.0 16.2 16.4 16.6 16.8 17.0 17.2 17.4 17.6 17.8 18.0 18.2 18.4 18.6 18.8 19.0 0 5 10 15 20 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5 7.0 7.5 8.0 8.5 9.0 9.5 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5 15.0 15.5 16.0 16.5 17.0 17.5 18.0 18.5 19.0 C T f -10 -8 -6 -4 -2 0 2 4 6 8 10 12 14 16 18 -8.0 -7.5 -7.0 -6.5 -6.0 -5.5 -5.0 -4.5 -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5 7.0 7.5 8.0 8.5 9.0 9.5 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5 15.0 15.5 16.0 -10 0 10 20 -8.0 -7.5 -7.0 -6.5 -6.0 -5.5 -5.0 -4.5 -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5 7.0 7.5 8.0 8.5 9.0 9.5 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5 15.0 15.5 16.0 y
In [41]:
logistic(z) = 1/(1+exp(-z))
plot(logistic,-6, 6)
Out[41]:
x -20 -18 -16 -14 -12 -10 -8 -6 -4 -2 0 2 4 6 8 10 12 14 16 18 20 -18.0 -17.5 -17.0 -16.5 -16.0 -15.5 -15.0 -14.5 -14.0 -13.5 -13.0 -12.5 -12.0 -11.5 -11.0 -10.5 -10.0 -9.5 -9.0 -8.5 -8.0 -7.5 -7.0 -6.5 -6.0 -5.5 -5.0 -4.5 -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5 7.0 7.5 8.0 8.5 9.0 9.5 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5 15.0 15.5 16.0 16.5 17.0 17.5 18.0 -20 -10 0 10 20 -18 -17 -16 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5 2.0 2.5 -1.00 -0.95 -0.90 -0.85 -0.80 -0.75 -0.70 -0.65 -0.60 -0.55 -0.50 -0.45 -0.40 -0.35 -0.30 -0.25 -0.20 -0.15 -0.10 -0.05 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60 0.65 0.70 0.75 0.80 0.85 0.90 0.95 1.00 1.05 1.10 1.15 1.20 1.25 1.30 1.35 1.40 1.45 1.50 1.55 1.60 1.65 1.70 1.75 1.80 1.85 1.90 1.95 2.00 -1 0 1 2 -1.0 -0.9 -0.8 -0.7 -0.6 -0.5 -0.4 -0.3 -0.2 -0.1 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0 f(x)
In [92]:
 
N
Min      8.0
1st Qu.  8.0
Median   8.0
Mean     8.0
3rd Qu.  8.0
Max      8.0
NAs      0
NA%      0.0%

y
Min      0.0
1st Qu.  3.0
Median   6.0
Mean     5.08
3rd Qu.  8.0
Max      8.0
NAs      0
NA%      0.0%

x
Min      7.66
1st Qu.  9.3375
Median   9.965
Mean     9.967199999999998
3rd Qu.  10.77
Max      12.44
NAs      0
NA%      0.0%

f
Length  100
Type    Pooled UTF8String
NAs     0
NA%     0.0%
Unique  2

p
Min      0.0
1st Qu.  0.375
Median   0.75
Mean     0.635
3rd Qu.  1.0
Max      1.0
NAs      0
NA%      0.0%

In [100]:
#6.5でも言われている通り本来はこれで動作させたい
data[:t] = hcat(data[:y], data[:N] - data[:y])
glm(t~x+f , data, Binomial(), LogitLink())
ArgumentError("setindex!(::DataFrame, ...) only broadcasts scalars, not arrays")
while loading In[100], in expression starting on line 1

 in setindex! at /Users/ysekky/.julia/DataFrames/src/dataframe/dataframe.jl:359
In [101]:
#一応これでも同じ結果は得られるが, これはNがすべて共通だから成立するものと思われる
data[:p] = data[:y]/8.0
glm(p~x+f , data, Binomial(), LogitLink())
Out[101]:
DataFrameRegressionModel{GeneralizedLinearModel,Float64}:

Coefficients:
             Estimate Std.Error  z value Pr(>|z|)
(Intercept)  -19.5361   3.99861 -4.88572    <1e-5
x             1.95241  0.392777  4.97077    <1e-6
f - T         2.02151  0.654152  3.09027   0.0020
In [105]:
#カテゴリカル変数と実数値を演算にした線形予測子は用いることができないようだ
glm(p~x+f+x*f , data, Binomial(), LogitLink())
key not found: :p
while loading In[105], in expression starting on line 1

 in getindex at /Users/ysekky/.julia/DataFrames/src/other/index.jl:105
 in getindex at /Users/ysekky/.julia/DataFrames/src/dataframe/dataframe.jl:232
 in anonymous at /Users/ysekky/.julia/DataFrames/src/statsmodels/formula.jl:186
 in map at ./base.jl:184
 in ModelFrame at /Users/ysekky/.julia/DataFrames/src/statsmodels/formula.jl:186
 in fit at /Users/ysekky/.julia/DataFrames/src/statsmodels/statsmodel.jl:52
 in glm at /Users/ysekky/.julia/GLM/src/glmfit.jl:170
In [108]:
#こっちも無理
data[:x] * data[:f]
`*` has no method matching *(::Array{Float64,1}, ::Array{UTF8String,1})
while loading In[108], in expression starting on line 2

 in * at /Users/ysekky/.julia/DataArrays/src/operators.jl:416
In [ ]: