# Walkthrough of the Discretizers package: categorical and linear
# discretizers, label types, and several bin-edge selection algorithms
# visualised with PGFPlots.

using Pkg  # `Pkg` is not in scope by default; load it before calling `Pkg.add`

Pkg.add("Discretizers")
using Discretizers

# --- Categorical discretizer -------------------------------------------------

data = [:cat, :dog, :dog, :cat, :cat, :elephant]
catdisc = CategoricalDiscretizer(data)

# `encode` accepts a single value or a whole array.
println(":cat becomes: ", encode(catdisc, :cat))
println(":dog becomes: ", encode(catdisc, :dog))
println("data becomes: ", encode(catdisc, data))

# `decode` goes the other way, from labels back to the original values.
println("1 becomes: ", decode(catdisc, 1))
println("2 becomes: ", decode(catdisc, 2))
println("[1,2,3] becomes: ", decode(catdisc, [1,2,3]))

# Construction from strings, integers, and a mixed-type array.
CategoricalDiscretizer(["A", "B", "C"])
CategoricalDiscretizer([5000, 1200, 100])
CategoricalDiscretizer([:dog, "hello world", NaN])

# --- Linear discretizer ------------------------------------------------------

bin_edges = [0.0,0.5,1.0]
lindisc = LinearDiscretizer(bin_edges)

println("0.2 becomes: ", encode(lindisc, 0.2))
println("0.7 becomes: ", encode(lindisc, 0.7))
println("0.5 becomes: ", encode(lindisc, 0.5))
println("it works on arrays: ", encode(lindisc, [0.0,0.8,0.2]))

println("1 becomes: ", decode(lindisc, 1))
println("2 becomes: ", decode(lindisc, 2))
println("it works on arrays: ", decode(lindisc, [2,1,2]))

println("number of labels: ", nlabels(catdisc), " ", nlabels(lindisc))
println("bin centers: ", bincenters(lindisc))
println("extrama of a bin: ", extrema(lindisc, 2))

# A second constructor argument is passed here as the label type.
# `data` still holds the symbol array from the categorical section.
catdisc = CategoricalDiscretizer(data, Int32)
lindisc = LinearDiscretizer(bin_edges, UInt8)
encode(lindisc, 0.2)

# --- Choosing bin edges from data --------------------------------------------

nbins = 3
data = randn(1000)
edges = binedges(DiscretizeUniformWidth(nbins), data)

using PGFPlots
using Distributions
using Random

"""
    density_axis(edges, samples; axis_kwargs...)

Build a PGFPlots `Axis` holding a filled constant plot of `samples` binned by
`edges`. Each bin's count is divided by its width before plotting.
`axis_kwargs` are forwarded unchanged to `Axis`.
"""
function density_axis(edges, samples; axis_kwargs...)
    disc = LinearDiscretizer(edges)
    counts = get_discretization_counts(disc, samples)
    arr_x, arr_y = get_histogram_plot_arrays(disc.binedges, counts ./ binwidths(disc))
    plot = Plots.Linear(arr_x, convert(Vector{Float64}, arr_y),
                        style="const plot, mark=none, fill=blue!60")
    return Axis(plot; axis_kwargs...)
end

# Draw a set of variables and filter values to a reasonable range.
Random.seed!(0)
data = [rand(Cauchy(-5, 1.8), 500);
        rand(Cauchy(-4, 0.8), 2000);
        rand(Cauchy(-1, 0.3), 500);
        rand(Cauchy( 2, 0.8), 1000);
        rand(Cauchy( 4, 1.5), 500)]
data = filter!(x -> -15.0 <= x <= 15.0, data)

discalgs = [
    ("Uniform Width", DiscretizeUniformWidth(15)),
    ("Uniform Count", DiscretizeUniformCount(15)),
    ("Quantile", DiscretizeQuantile(15)),
    ("Bayesian Blocks", DiscretizeBayesianBlocks()),
]

# Four algorithms are plotted, so the grid needs four cells (2 x 2); a 3 x 1
# grid has no cell for the last axis.
g = GroupPlot(2, 2, groupStyle = "horizontal sep = 1.75cm, vertical sep = 2cm")
for (name, discalg) in discalgs
    push!(g, density_axis(binedges(discalg, data), data;
                          ymin=0, xlabel="x", ylabel="pdf(x)", title=name, width="6cm"))
end
g

# --- Automatic bin-count rules for uniform-width binning ---------------------

g = GroupPlot(3, 3, groupStyle = "horizontal sep = 1.75cm, vertical sep = 1.5cm")
discalgs = [
    :sqrt,    # used by Excel and others for its simplicity and speed
    :sturges, # R's default method, only good for near-Gaussian data
    :rice,    # commonly overestimates the number of bins required
    :doane,   # improves Sturges' for non-normal datasets
    :scott,   # less robust estimator that takes into account data variability and data size
    :fd,      # Freedman Diaconis Estimator, robust
    :auto,    # max between :fd and :sturges. Good all-round performance
]
for discalg in discalgs
    push!(g, density_axis(binedges(DiscretizeUniformWidth(discalg), data), data;
                          ymin=0, title=string(discalg)))
end
g

# --- Bin edges computed from data plus integer class labels ------------------

data = [randn(100); randn(100).+1.0]
labels = [fill(:cat, 100); fill(:dog, 100)]
integer_labels = encode(CategoricalDiscretizer([:cat, :dog]), labels)
edges = binedges(DiscretizeMODL_Optimal(), data, integer_labels)