using CombineML.Util
using CombineML.Transformers
import RDatasets
# Load the iris dataset: 4 numeric features (sepal/petal length & width) and
# the species label to predict.
iris = RDatasets.dataset("datasets", "iris")
X = convert(Array, iris[[:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]])
y = convert(Array, iris[:Species]);
# Split into training and test sets (30% of rows held out for testing).
# NOTE(review): holdout is random, so indices differ across runs.
(train_ind, test_ind) = holdout(size(X, 1), 0.3)
# => ([76, 1, 118, 36, 102, 132, 28, 108, 90, 147 … 17, 113, 88, 77, 85, 47, 61, 144, 54, 60], [39, 68, 111, 24, 20, 114, 8, 52, 142, 44 … 53, 104, 93, 122, 46, 25, 30, 80, 23, 32])
# (pasted REPL output above, commented out so the file parses)
# Decision-tree learner with post-pruning (DecisionTree.jl wrapper), using
# default options (class output, purity threshold 1.0 per the echo below).
prunedTreeLearner = PrunedTree()
# => CombineML.Transformers.DecisionTreeWrapper.PrunedTree(nothing, Dict{Symbol,Any}(Pair{Symbol,Any}(:output, :class),Pair{Symbol,Any}(:impl_options, Dict(:purity_threshold=>1.0))))
# (pasted REPL output above, commented out so the file parses)
# Preprocessing + learning pipeline: transformers are applied in order.
pipeline = Pipeline(Dict(
    :transformers => [
        OneHotEncoder(),  # Encodes nominal features into numeric
        Imputer(),        # Imputes NA values
        #StandardScaler(), # Standardizes features
        prunedTreeLearner # Predicts labels on instances
    ]
))
# => CombineML.Transformers.CombineMLTransformers.Pipeline(nothing, Dict{Symbol,Any}(Pair{Symbol,Any}(:transformers, CombineML.Types.Transformer[CombineML.Transformers.CombineMLTransformers.OneHotEncoder(nothing, Dict(:nominal_column_values_map=>nothing,:nominal_columns=>nothing)), CombineML.Transformers.CombineMLTransformers.Imputer(nothing, Dict(:strategy=>mean)), CombineML.Transformers.DecisionTreeWrapper.PrunedTree(nothing, Dict{Symbol,Any}(Pair{Symbol,Any}(:output, :class),Pair{Symbol,Any}(:impl_options, Dict(:purity_threshold=>1.0))))]),Pair{Symbol,Any}(:transformer_options, nothing)))
# (pasted REPL output above, commented out so the file parses)
# Train the pipeline on the training split (fits encoder, imputer, and tree).
fit!(pipeline, X[train_ind, :], y[train_ind]);
# Predict labels for the held-out test instances.
predictions = transform!(pipeline, X[test_ind, :]);
# Manual accuracy: fraction of correct predictions, as a percentage.
sum(predictions .== y[test_ind])/length(predictions)*100
# => 97.77777777777777
# Same accuracy computed via CombineML's score helper.
result = score(:accuracy, y[test_ind], predictions)
println(result)
# => 97.77777777777777
# (pasted REPL output above, commented out so the file parses)
"""
    processModel(learner)

Train and evaluate `learner` on the iris dataset.

Reloads iris, makes a random 70/30 train/test split, wraps `learner` in a
OneHotEncoder + Imputer pipeline, fits on the training split, and returns
the classification accuracy (in percent) on the test split.

Because `holdout` is random, repeated calls return different scores.
"""
function processModel(learner)
    iris = RDatasets.dataset("datasets", "iris")
    X = convert(Array, iris[[:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]])
    y = convert(Array, iris[:Species]);
    # Random split: 30% of the rows are held out for testing.
    (train_ind, test_ind) = holdout(size(X, 1), 0.3)
    pipeline = Pipeline(Dict(
        :transformers => [
            OneHotEncoder(),  # Encodes nominal features into numeric
            Imputer(),        # Imputes NA values
            #StandardScaler(), # Standardizes features
            learner           # Predicts labels on instances
        ]
    ))
    # Train on the training split.
    fit!(pipeline, X[train_ind, :], y[train_ind]);
    # Predict on the held-out split.
    predictions = transform!(pipeline, X[test_ind, :]);
    # Accuracy as a percentage.
    result = score(:accuracy, y[test_ind], predictions)
    return(result)
end
# => processModel (generic function with 1 method)
# (pasted REPL output above, commented out so the file parses)
# AdaBoost over decision stumps, 7 boosting rounds.
adaLearner = DecisionStumpAdaboost(Dict(
    # Output to train against
    # (:class).
    :output => :class,
    # Options specific to this implementation.
    :impl_options => Dict(
        # Number of boosting iterations.
        :num_iterations => 7
    )
))
processModel(adaLearner)
# => 64.44444444444444
# (pasted REPL output above, commented out so the file parses)
# Random forest: 10 trees, 70% row subsampling per tree; `nothing` for
# :num_subfeatures lets the implementation pick its default.
rfLearner = RandomForest(Dict(
    :output => :class,
    :impl_options => Dict(
        :num_subfeatures => nothing,
        :num_trees => 10,
        :partial_sampling => 0.7
    )
))
processModel(rfLearner)
# => 93.33333333333333
# (pasted REPL output above, commented out so the file parses)
# Wrap a scikit-learn model through ScikitLearn.jl/PyCall.
# NOTE(review): the pasted error below shows ScikitLearn was not installed
# when this transcript was made — this section requires
# `Pkg.add("ScikitLearn")` (plus a working Python scikit-learn) to run.
using ScikitLearn
@sk_import neighbors: KNeighborsClassifier
@sk_import svm: SVC
skLearner = SKLLearner(Dict(
    :output => :class,
    #:learner => "KNeighborsClassifier",
    :learner => "SVC",
    :impl_options => Dict()
))
processModel(skLearner)
# => ArgumentError: Module ScikitLearn not found in current path. Run `Pkg.add("ScikitLearn")` to install the ScikitLearn package. Stacktrace: [1] _require(::Symbol) at ./loading.jl:435 [2] require(::Symbol) at ./loading.jl:405
# (pasted REPL error output above, commented out so the file parses)
# Majority-vote ensemble over three base learners.
voteLearner = VoteEnsemble(Dict(
    :output => :class,
    # Learners in voting committee.
    :learners => [RandomForest(),PrunedTree(), DecisionStumpAdaboost()]
))
processModel(voteLearner)
# => 97.77777777777777
# (pasted REPL output above, commented out so the file parses)
# Model selection: 5-fold cross-validate each candidate learner and keep the
# one with the best mean partition score.
bestLearner = BestLearner(Dict(
    :output => :class,
    # Generates the CV partitions: 5-fold over the training rows.
    :partition_generator => (X, y) -> kfold(size(X, 1), 5),
    # Picks the learner with the highest row-mean score; findmax(...)[2] is
    # the index of that learner.
    # NOTE(review): `mean(A, 2)` is Julia <= 0.6 syntax; on 1.x this is
    # `mean(A, dims=2)` — confirm the target Julia version.
    :selection_function => (learner_partition_scores) -> findmax(mean(learner_partition_scores, 2))[2],
    :score_type => Real,
    :learners => [PrunedTree(), DecisionStumpAdaboost(), RandomForest()],
    :learner_options_grid => nothing
))
processModel(bestLearner)
# => 97.77777777777777
# (pasted REPL output above, commented out so the file parses)
# Stacked ensemble: base learners' predictions feed a RandomForest stacker.
stackLearner = StackEnsemble(Dict(
    :output => :class,
    :learners => [PrunedTree(), DecisionStumpAdaboost(), RandomForest(),voteLearner,bestLearner],
    :stacker => RandomForest(),
    # Proportion of training set left to train stacker itself.
    :stacker_training_proportion => 0.3,
    :keep_original_features => false
))
processModel(stackLearner)
# => 95.55555555555556
# (pasted REPL output above, commented out so the file parses)
# Run 30 independent train/evaluate cycles of the stacked ensemble in
# parallel and vcat the accuracy scores into one vector.
# NOTE(review): `@parallel` is Julia <= 0.6 syntax; on 1.x use
# `Distributed.@distributed` (and `using Statistics` for mean/std) —
# confirm the target Julia version.
results=@parallel (vcat) for i=1:30
    processModel(stackLearner)
end
# Report mean accuracy +/- standard deviation across the 30 runs.
println("acc = ",round(mean(results))," +/- ",round(std(results)))
# => acc = 94.0 +/- 4.0
results
# => 30-element Array{Float64,1}: 100.0 93.3333 95.5556 88.8889 91.1111 93.3333 93.3333 95.5556 97.7778 95.5556 88.8889 93.3333 91.1111 ⋮ 97.7778 93.3333 95.5556 84.4444 95.5556 93.3333 93.3333 95.5556 97.7778 93.3333 95.5556 91.1111
# (pasted REPL output above, commented out so the file parses)
#svmcrt = CRTLearner(Dict(
# Output to train against
# (:class).
#:output => :class,
#:learner => "rf",
#:learner => "svmLinear2",
#:learner => "rpart",
#:learner => "lda",
#:impl_options => Dict()
#))