using CombineML.Util
using CombineML.Transformers
import RDatasets
# Load the iris dataset: 4 numeric features (sepal/petal length & width) and
# the species label to predict.
iris = RDatasets.dataset("datasets", "iris")
X = convert(Array, iris[[:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]])
y = convert(Array, iris[:Species]);
# Split into training and test sets (30% of rows held out for testing).
# NOTE(review): holdout is random, so indices differ across runs.
(train_ind, test_ind) = holdout(size(X, 1), 0.3)
# => ([76, 1, 118, 36, 102, 132, 28, 108, 90, 147 … 17, 113, 88, 77, 85, 47, 61, 144, 54, 60], [39, 68, 111, 24, 20, 114, 8, 52, 142, 44 … 53, 104, 93, 122, 46, 25, 30, 80, 23, 32])
# (pasted REPL output above, commented out so the file parses)
# Decision-tree learner with post-pruning (DecisionTree.jl wrapper), using
# default options (class output, purity threshold 1.0 per the echo below).
prunedTreeLearner = PrunedTree()
# => CombineML.Transformers.DecisionTreeWrapper.PrunedTree(nothing, Dict{Symbol,Any}(Pair{Symbol,Any}(:output, :class),Pair{Symbol,Any}(:impl_options, Dict(:purity_threshold=>1.0))))
# (pasted REPL output above, commented out so the file parses)
# Preprocessing + learning pipeline: transformers are applied in order.
pipeline = Pipeline(Dict(
    :transformers => [
        OneHotEncoder(),  # Encodes nominal features into numeric
        Imputer(),        # Imputes NA values
        #StandardScaler(), # Standardizes features
        prunedTreeLearner # Predicts labels on instances
    ]
))
# => CombineML.Transformers.CombineMLTransformers.Pipeline(nothing, Dict{Symbol,Any}(Pair{Symbol,Any}(:transformers, CombineML.Types.Transformer[CombineML.Transformers.CombineMLTransformers.OneHotEncoder(nothing, Dict(:nominal_column_values_map=>nothing,:nominal_columns=>nothing)), CombineML.Transformers.CombineMLTransformers.Imputer(nothing, Dict(:strategy=>mean)), CombineML.Transformers.DecisionTreeWrapper.PrunedTree(nothing, Dict{Symbol,Any}(Pair{Symbol,Any}(:output, :class),Pair{Symbol,Any}(:impl_options, Dict(:purity_threshold=>1.0))))]),Pair{Symbol,Any}(:transformer_options, nothing)))
# (pasted REPL output above, commented out so the file parses)
# Train the pipeline on the training split (fits encoder, imputer, and tree).
fit!(pipeline, X[train_ind, :], y[train_ind]);
# Predict labels for the held-out test instances.
predictions = transform!(pipeline, X[test_ind, :]);
# Manual accuracy: fraction of correct predictions, as a percentage.
sum(predictions .== y[test_ind])/length(predictions)*100
# => 97.77777777777777
# Same accuracy computed via CombineML's score helper.
result = score(:accuracy, y[test_ind], predictions)
println(result)
# => 97.77777777777777
# (pasted REPL output above, commented out so the file parses)
"""
    processModel(learner)

Train and evaluate `learner` on the iris dataset.

Reloads iris, makes a random 70/30 train/test split, wraps `learner` in a
OneHotEncoder + Imputer pipeline, fits on the training split, and returns
the classification accuracy (in percent) on the test split.

Because `holdout` is random, repeated calls return different scores.
"""
function processModel(learner)
    iris = RDatasets.dataset("datasets", "iris")
    X = convert(Array, iris[[:SepalLength, :SepalWidth, :PetalLength, :PetalWidth]])
    y = convert(Array, iris[:Species]);
    # Random split: 30% of the rows are held out for testing.
    (train_ind, test_ind) = holdout(size(X, 1), 0.3)
    pipeline = Pipeline(Dict(
        :transformers => [
            OneHotEncoder(),  # Encodes nominal features into numeric
            Imputer(),        # Imputes NA values
            #StandardScaler(), # Standardizes features
            learner           # Predicts labels on instances
        ]
    ))
    # Train on the training split.
    fit!(pipeline, X[train_ind, :], y[train_ind]);
    # Predict on the held-out split.
    predictions = transform!(pipeline, X[test_ind, :]);
    # Accuracy as a percentage.
    result = score(:accuracy, y[test_ind], predictions)
    return(result)
end
# => processModel (generic function with 1 method)
# (pasted REPL output above, commented out so the file parses)
# AdaBoost over decision stumps, 7 boosting rounds.
adaLearner = DecisionStumpAdaboost(Dict(
    # Output to train against
    # (:class).
    :output => :class,
    # Options specific to this implementation.
    :impl_options => Dict(
        # Number of boosting iterations.
        :num_iterations => 7
    )
))
processModel(adaLearner)
# => 64.44444444444444
# (pasted REPL output above, commented out so the file parses)
# Random forest: 10 trees, 70% row subsampling per tree; `nothing` for
# :num_subfeatures lets the implementation pick its default.
rfLearner = RandomForest(Dict(
    :output => :class,
    :impl_options => Dict(
        :num_subfeatures => nothing,
        :num_trees => 10,
        :partial_sampling => 0.7
    )
))
processModel(rfLearner)
# => 93.33333333333333
# (pasted REPL output above, commented out so the file parses)
# Wrap a scikit-learn model through ScikitLearn.jl/PyCall.
# NOTE(review): the pasted error below shows ScikitLearn was not installed
# when this transcript was made — this section requires
# `Pkg.add("ScikitLearn")` (plus a working Python scikit-learn) to run.
using ScikitLearn
@sk_import neighbors: KNeighborsClassifier
@sk_import svm: SVC
skLearner = SKLLearner(Dict(
    :output => :class,
    #:learner => "KNeighborsClassifier",
    :learner => "SVC",
    :impl_options => Dict()
))
processModel(skLearner)
# => ArgumentError: Module ScikitLearn not found in current path. Run `Pkg.add("ScikitLearn")` to install the ScikitLearn package. Stacktrace: [1] _require(::Symbol) at ./loading.jl:435 [2] require(::Symbol) at ./loading.jl:405
# (pasted REPL error output above, commented out so the file parses)
# Majority-vote ensemble over three base learners.
voteLearner = VoteEnsemble(Dict(
    :output => :class,
    # Learners in voting committee.
    :learners => [RandomForest(),PrunedTree(), DecisionStumpAdaboost()]
))
processModel(voteLearner)
# => 97.77777777777777
# (pasted REPL output above, commented out so the file parses)
# Model selection: 5-fold cross-validate each candidate learner and keep the
# one with the best mean partition score.
bestLearner = BestLearner(Dict(
    :output => :class,
    # Generates the CV partitions: 5-fold over the training rows.
    :partition_generator => (X, y) -> kfold(size(X, 1), 5),
    # Picks the learner with the highest row-mean score; findmax(...)[2] is
    # the index of that learner.
    # NOTE(review): `mean(A, 2)` is Julia <= 0.6 syntax; on 1.x this is
    # `mean(A, dims=2)` — confirm the target Julia version.
    :selection_function => (learner_partition_scores) -> findmax(mean(learner_partition_scores, 2))[2],
    :score_type => Real,
    :learners => [PrunedTree(), DecisionStumpAdaboost(), RandomForest()],
    :learner_options_grid => nothing
))
processModel(bestLearner)
# => 97.77777777777777
# (pasted REPL output above, commented out so the file parses)
# Stacked ensemble: base learners' predictions feed a RandomForest stacker.
stackLearner = StackEnsemble(Dict(
    :output => :class,
    :learners => [PrunedTree(), DecisionStumpAdaboost(), RandomForest(),voteLearner,bestLearner],
    :stacker => RandomForest(),
    # Proportion of training set left to train stacker itself.
    :stacker_training_proportion => 0.3,
    :keep_original_features => false
))
processModel(stackLearner)
# => 95.55555555555556
# (pasted REPL output above, commented out so the file parses)
# Run 30 independent train/evaluate cycles of the stacked ensemble in
# parallel and vcat the accuracy scores into one vector.
# NOTE(review): `@parallel` is Julia <= 0.6 syntax; on 1.x use
# `Distributed.@distributed` (and `using Statistics` for mean/std) —
# confirm the target Julia version.
results=@parallel (vcat) for i=1:30
    processModel(stackLearner)
end
# Report mean accuracy +/- standard deviation across the 30 runs.
println("acc = ",round(mean(results))," +/- ",round(std(results)))
# => acc = 94.0 +/- 4.0
results
# => 30-element Array{Float64,1}: 100.0 93.3333 95.5556 88.8889 91.1111 93.3333 93.3333 95.5556 97.7778 95.5556 88.8889 93.3333 91.1111 ⋮ 97.7778 93.3333 95.5556 84.4444 95.5556 93.3333 93.3333 95.5556 97.7778 93.3333 95.5556 91.1111
# (pasted REPL output above, commented out so the file parses)
#svmcrt = CRTLearner(Dict(
# Output to train against
# (:class).
#:output => :class,
#:learner => "rf",
#:learner => "svmLinear2",
#:learner => "rpart",
#:learner => "lda",
#:impl_options => Dict()
#))