using Gadfly
using DataFrames
using DecisionTree
# Read the passenger CSV at `filename` and prepare its columns for modelling.
#
# NA ages are replaced by the mean of the observed ages in the same file; NA
# fares get the same treatment, but only when the file actually has any. NA
# embarkation codes are replaced by "S". Sex, Pclass and Embarked are pooled.
#
# Returns a tuple: the feature columns (Pclass, Age, Sex, SibSp, Parch, Fare,
# Embarked) and a one-column table holding PassengerId.
function cleanData(filename)
    tbl = readtable(filename)
    pool!(tbl, [:Sex, :Pclass])

    # Age is imputed unconditionally with the mean of the non-NA entries.
    ageKnown = !isna(tbl[:Age])
    tbl[:Age] = array(tbl[:Age], mean(tbl[ageKnown, :Age]))

    # Fare is left alone unless at least one value is NA.
    fareMissing = isna(tbl[:Fare])
    if any(fareMissing)
        tbl[:Fare] = array(tbl[:Fare], mean(tbl[!fareMissing, :Fare]))
    end

    # Fill Embarked first, then pool the completed column.
    tbl[:Embarked] = array(tbl[:Embarked], utf8("S"))
    pool!(tbl, [:Embarked])

    featureNames = [:Pclass, :Age, :Sex, :SibSp, :Parch, :Fare, :Embarked]
    return tbl[:, featureNames], tbl[:, [:PassengerId]]
end
cleanData (generic function with 1 method)
# Read the CSV at `filename`, pool its Survived column, and return that column.
function readylabel(filename)
    labelled = readtable(filename)
    pool!(labelled, [:Survived])
    return labelled[:Survived]
end
readylabel (generic function with 1 method)
# Build the cleaned feature table and the passenger-id table for the training
# and test files, then load the pooled Survived labels for the training file.
xTrain,idTrain=cleanData("train.csv")
xTest,idTest=cleanData("test.csv")
yTrain=readylabel("train.csv")
891-element PooledDataArray{Int64,Uint8,1}: 0 1 1 1 0 0 0 0 1 1 1 1 0 ⋮ 1 1 0 0 0 0 0 0 1 0 1 0
# Summarise every cleaned test column; the recorded summary reports 0 NAs in each.
describe(xTest)
Pclass Min 1.0 1st Qu. 1.0 Median 3.0 Mean 2.2655502392344498 3rd Qu. 3.0 Max 3.0 NAs 0 NA% 0.0% Age Min 0.17 1st Qu. 23.0 Median 30.272590361445783 Mean 30.272590361445793 3rd Qu. 35.75 Max 76.0 NAs 0 NA% 0.0% Sex Length 418 Type Pooled UTF8String NAs 0 NA% 0.0% Unique 2 SibSp Min 0.0 1st Qu. 0.0 Median 0.0 Mean 0.4473684210526316 3rd Qu. 1.0 Max 8.0 NAs 0 NA% 0.0% Parch Min 0.0 1st Qu. 0.0 Median 0.0 Mean 0.3923444976076555 3rd Qu. 0.0 Max 9.0 NAs 0 NA% 0.0% Fare Min 0.0 1st Qu. 7.8958 Median 14.4542 Mean 35.627188489208635 3rd Qu. 31.5 Max 512.3292 NAs 0 NA% 0.0% Embarked Length 418 Type Pooled UTF8String NAs 0 NA% 0.0% Unique 3
# Convert the labels and both feature tables to plain arrays before they are
# passed to the forest functions. The recorded result for the test features is
# a 418x7 Array{Any,2}, i.e. numeric and string columns share one matrix.
yTrain=array(yTrain)
xTrain=array(xTrain)
xTest=array(xTest)
418x7 Array{Any,2}: 3 34.5 "male" 0 0 7.8292 "Q" 3 47.0 "female" 1 0 7.0 "S" 2 62.0 "male" 0 0 9.6875 "Q" 3 27.0 "male" 0 0 8.6625 "S" 3 22.0 "female" 1 1 12.2875 "S" 3 14.0 "male" 0 0 9.225 "S" 3 30.0 "female" 0 0 7.6292 "Q" 2 26.0 "male" 1 1 29.0 "S" 3 18.0 "female" 0 0 7.2292 "C" 3 21.0 "male" 2 0 24.15 "S" 3 30.2726 "male" 0 0 7.8958 "S" 1 46.0 "male" 0 0 26.0 "S" 1 23.0 "female" 1 0 82.2667 "S" ⋮ ⋮ 2 23.0 "male" 1 0 10.5 "S" 1 50.0 "male" 1 1 211.5 "C" 3 30.2726 "female" 0 0 7.7208 "Q" 3 3.0 "female" 1 1 13.775 "S" 3 30.2726 "female" 0 0 7.75 "Q" 1 37.0 "female" 1 0 90.0 "Q" 3 28.0 "female" 0 0 7.775 "S" 3 30.2726 "male" 0 0 8.05 "S" 1 39.0 "female" 0 0 108.9 "C" 3 38.5 "male" 0 0 7.25 "S" 3 30.2726 "male" 0 0 8.05 "S" 3 30.2726 "male" 1 1 22.3583 "C"
# Cross-validate a random forest on the training data; the recorded run shows
# 4 folds. The other positional arguments (5, 20, 0.7) are the same values
# given to build_forest -- presumably features tried per split, number of
# trees and per-tree sampling fraction; confirm against the DecisionTree.jl
# signature. The trailing `;` suppresses echoing the returned value.
accuracy = nfoldCV_forest(yTrain, xTrain, 5, 20, 4, 0.7);
Fold 1 Classes: {0,1} Matrix: [115 14 27 66] Accuracy: 0.8153153153153153 Kappa: 0.6131089007906146 Fold 2 Classes: {0,1} Matrix: [122 17 26 57] Accuracy: 0.8063063063063063 Kappa: 0.5770491803278688 Fold 3 Classes: {0,1} Matrix: [115 18 24 65] Accuracy: 0.8108108108108109 Kappa: 0.6017086715079027 Fold 4 Classes: {0,1} Matrix: [125 21 20 56] Accuracy: 0.8153153153153153 Kappa: 0.591141856077621 Mean Accuracy: 0.8119369369369369
# Fit the final forest on the whole training set. The recorded model summary
# reports 20 trees; 5 and 0.7 are presumably the number of features tried per
# split and the per-tree sampling fraction -- confirm against DecisionTree.jl.
model = build_forest(yTrain, xTrain, 5, 20, 0.7)
Ensemble of Decision Trees Trees: 20 Avg Leaves: 108.9 Avg Depth: 18.75
# Run the fitted forest over the test features; the recorded result holds one
# 0/1 prediction per test row in an Array{Any,1}.
predy=apply_forest(model,xTest)
418-element Array{Any,1}: 0 0 0 0 0 0 1 0 1 0 0 0 1 ⋮ 0 0 1 1 1 1 1 0 1 0 0 1
# Wrap the predictions in a one-column table named Survived.
# A named column is a DataFrame feature: DataArray accepts no keyword
# arguments, so calling it with `Survived=...` raises an error and leaves
# predydf undefined. The Any-typed predictions are converted to Int before the
# table is built, so the column is integer-typed and predydf remains a table
# that can be placed beside the passenger-id table.
predydf = DataFrame(Survived=int(predy))
function DataArray does not accept keyword arguments while loading In[10], in expression starting on line 1
# Place the passenger ids and the predictions side by side.
resultdf = hcat(idTest, predydf)
predydf not defined while loading In[11], in expression starting on line 1
# Write the passenger ids next to the raw predictions as comma-separated rows.
# Only the data matrix is passed, so the file has no header line.
writedlm("output.csv", hcat(array(idTest), predy), ',')
# Scratch check: convert does not parse strings -- this call raises a
# no-matching-method error, as recorded in the output that follows.
convert(Int64,"3")
`convert` has no method matching convert(::Type{Int64}, ::ASCIIString) while loading In[14], in expression starting on line 1 in convert at base.jl:13
# Scratch check: int does parse a numeric string; the recorded result is 3.
int("3")
3
# Scratch check: a plain ASCII string literal has type ASCIIString (recorded result: true).
typeof("3")==ASCIIString
true