using Pkg Pkg.activate("../../../lessons/") Pkg.instantiate(); using CSV using DataFrames using LinearAlgebra using Distributions using StatsFuns using RxInfer using Plots default(label="", margin=10Plots.pt) # Read CSV file df = DataFrame(CSV.File("../datasets/stock_exchange.csv")) # Count number of samples time_period = 436:536 num_samples = length(time_period) # Extract columns dates_num = 1:num_samples dates_str = df[time_period,1] stock_val = df[time_period,2] # Set xticks xtick_points = Int64.(round.(range(1, stop=num_samples, length=5))) # Scatter exchange levels scatter(dates_num, stock_val, color="black", label="", ylabel="Stock Market Levels", xlabel="time (days)", xticks=(xtick_points, [dates_str[i] for i in xtick_points]), size=(800,300)) @model function linear_regression(μ_θ, Σ_θ, σ2_y; N=1) "Bayesian linear regression" # Allocate data variables X = datavar(Vector{Float64}, N) y = datavar(Float64, N) # Prior distribution of coefficients θ ~ MvNormalMeanCovariance(μ_θ, Σ_θ) for i = 1:N # Likelihood of i-th sample y[i] ~ NormalMeanVariance(dot(θ,X[i]), σ2_y) end return y, X, θ end # Prior parameters μ_θ, Σ_θ = (zeros(2), diagm(ones(2))) # Likelihood variance σ2_y = 1.0; # Call inference function results = inference( model = linear_regression(μ_θ, Σ_θ, σ2_y, N=num_samples), data = (y = stock_val, X = [[dates_num[i], 1.0] for i in 1:num_samples]), returnvars = (θ = KeepLast()), ) # Extract posterior weights post_θ = results.posteriors[:θ] # Define ranges for plot x1 = range(-1.5, length=500, stop=1.5) x2 = range(-2.5, length=500, stop=2.5) # Draw contour plots of distributions prior_θ = MvNormal(μ_θ, Σ_θ) p1a = contour(x1, x2, (x1,x2) -> pdf(prior_θ, [x1,x2]), xlabel="θ1", ylabel="θ2", title="prior", label="") p1b = contour(x1, x2, (x1,x2) -> pdf(post_θ, [x1,x2]), xlabel="θ1", title="posterior", label="") plot(p1a, p1b, size=(900,300)) # Extract estimated weights θ_MAP = mode(post_θ) # Report results println("Slope coefficient = "*string(θ_MAP[1])) println("Intercept coefficient = "*string(θ_MAP[2])) # Make predictions regression_estimated = dates_num * θ_MAP[1] .+ θ_MAP[2]; # Visualize observations scatter(dates_num, stock_val, color="black", xticks=(xtick_points, [dates_str[i] for i in xtick_points]), label="observations", legend=:topleft) # Overlay regression function plot!(dates_num, regression_estimated, color="blue", label="regression", linewidth=2, size=(800,300)) # Read CSV file df = DataFrame(CSV.File("../datasets/diagnosis_train.csv")) # Split dataframe into features and labels features_train = Matrix(df[:,1:5]) labels_train = Vector(df[:,6]) # Store number of features num_features = size(features_train,2) # Number of training samples num_train = size(features_train,1); plot(xlabel="feature1", ylabel="feature2", size=(800,400)) scatter!(features_train[labels_train .== 0, 1], features_train[labels_train .== 0, 2], color="blue", label="Healthy Controls") scatter!(features_train[labels_train .== 1, 1], features_train[labels_train .== 1, 2], color="red", label="COPD patients") # Parameters for priors μ_θ = zeros(num_features+1,) Σ_θ = diagm(ones(num_features+1)); @model function linear_classification(μ_θ, Σ_θ; N=1) "Bayesian classification model" # Allocate data variables X = datavar(Vector{Float64}, N) y = datavar(Float64, N) # Weight prior distribution θ ~ MvNormalMeanCovariance(μ_θ, Σ_θ) # Binary likelihood for i = 1:N y[i] ~ Probit(dot(θ, X[i])) end return y, X, θ end results = inference( model = linear_classification(μ_θ, Σ_θ, N=num_train), data = (y = labels_train, X = [[features_train[i,:]; 1.0] for i in 1:num_train]), returnvars = (θ = KeepLast()), iterations = 10, ) # Coefficients to visualize cix = [1,2] # Reduce posterior distribution to chosen dimensions m_cix = mean(results.posteriors[:θ])[cix] S_cix = cov( results.posteriors[:θ])[cix,cix] post_θ = MvNormal(m_cix, S_cix) # Reduce prior distribution to chosen dimensions prior_θ = MvNormal(μ_θ[cix], Σ_θ[cix,cix]) # Define ranges for plot x1 = range(-1., length=500, stop=1.) x2 = range(-1., length=500, stop=1.) # Draw contour plots of distributions p1a = contour(x1, x2, (x1,x2) -> pdf(prior_θ, [x1,x2]), xlabel="θ1", ylabel="θ2", title="prior", label="") p1b = contour(x1, x2, (x1,x2) -> pdf(post_θ, [x1,x2]), xlabel="θ1", title="posterior", label="") plot(p1a, p1b, size=(900,300)) # Read CSV file df = DataFrame(CSV.File("../datasets/diagnosis_test.csv")) # Split dataframe into features and labels features_test = Matrix(df[:,1:5]) labels_test = Vector(df[:,6]) # Number of test samples num_test = size(features_test,1); # Extract MAP estimate of classification parameters θ_MAP = mode(results.posteriors[:θ]) # Compute dot product between parameters and test data fθ_pred = [features_test ones(num_test,)] * θ_MAP # Predict labels through probit labels_pred = round.(normcdf.(fθ_pred)); # Compute classification accuracy of test data accuracy_test = mean(labels_test .== labels_pred) # Report result println("Test Accuracy = "*string(accuracy_test*100)*"%")