without_header = '../examples/data/blogData_train.csv' with_header = '../examples/data/blogData_train_with_header.csv' colnames = (1..281).to_a.map { |x| "v#{x}" } header = colnames.join(',') File.open(with_header, 'w') do |fo| fo.puts header File.foreach(without_header) do |li| fo.puts li end end require 'daru' df = Daru::DataFrame.from_csv '../examples/data/blogData_train_with_header.csv' puts "Number of rows: #{df.nrows}" puts "Number of columns: #{df.ncols}" # select a subset of columns of the data frame keep = ['v16', 'v41', 'v54', 'v62', 'v270', 'v271', 'v272', 'v273', 'v274', 'v275', 'v276', 'v277', 'v280'] blog_data = df[*keep] df = nil # assign meaningful names for the selected columns meaningful_names = [:host_comments_avg, :host_trackbacks_avg, :comments, :length, :mo, :tu, :we, :th, :fr, :sa, :su, :parents, :parents_comments] blog_data.vectors = Daru::Index.new(meaningful_names) # the resulting data set blog_data.head nonzero_ind = blog_data[:length].each_index.select do |i| blog_data[:length][i] > 0 && blog_data[:comments][i] > 0 end blog_data = blog_data.row[*nonzero_ind] puts "Remaining number of rows: #{blog_data.nrows}" days = Array.new(blog_data.nrows) { :unknown } [:mo, :tu, :we, :th, :fr, :sa, :su].each do |d| ind = blog_data[d].to_a.each_index.select { |i| blog_data[d][i]==1 } ind.each { |i| days[i] = d.to_s } blog_data.delete_vector(d) end blog_data[:day] = days blog_data.head 3 # create a binary indicator vector specifying if a blog post has at least # one parent post which has comments hpwc = (blog_data[:parents] * blog_data[:parents_comments]).to_a blog_data[:has_parent_with_comments] = hpwc.map { |t| t == 0 ? 'no' : 'yes'} blog_data.delete_vector(:parents) blog_data.delete_vector(:parents_comments) blog_data.head 3 log_comments = blog_data[:comments].to_a.map { |c| Math::log(c) } log_host_comments_avg = blog_data[:host_comments_avg].to_a.map { |c| Math::log(c) } blog_data[:log_comments] = log_comments blog_data[:log_host_comments_avg] = log_host_comments_avg blog_data.head 3 require 'mixed_models' model_fit = LMM.from_formula(formula: "log_comments ~ log_host_comments_avg + host_trackbacks_avg + length + has_parent_with_comments + (1 | day)", data: blog_data) model_fit.fix_ef model_fit.fix_ef_summary require 'gnuplotrb' include GnuplotRB x, y = model_fit.fitted, model_fit.residuals fitted_vs_residuals = Plot.new([[x,y], with: 'points', pointtype: 6, notitle: true], xlabel: 'Fitted', ylabel: 'Residuals') fitted_vs_residuals.term('png') bin_width = (y.max - y.min)/10.0 bins = (y.min..y.max).step(bin_width).to_a rel_freq = Array.new(bins.length-1){0.0} y.each do |r| 0.upto(bins.length-2) do |i| if r >= bins[i] && r < bins[i+1] then rel_freq[i] += 1.0/y.length end end end bins_center = bins[0...-1].map { |b| b + bin_width/2.0 } residuals_hist = Plot.new([[bins_center, rel_freq], with: 'boxes', notitle: true], style: 'fill solid 0.5') residuals_hist.term('png') require 'distribution' observed = model_fit.residuals.sort n = observed.length theoretical = (1..n).to_a.map { |t| Distribution::Normal.p_value(t.to_f/n.to_f) * model_fit.sigma} qq_plot = Plot.new([[theoretical, observed], with: 'points', pointtype: 6, notitle: true], ['x', with: 'lines', notitle: true], xlabel: 'Normal theoretical quantiles', ylabel: 'Observed quantiles', title: 'Q-Q plot of the residuals') qq_plot.term('png') puts "Obtained fixed effects coefficient estimates:" puts model_fit.fix_ef model_fit.fix_ef_p(method: :wald) conf_int = model_fit.fix_ef_conf_int(level: 0.95, method: :wald) ci = Daru::DataFrame.rows(conf_int.values, order: [:lower95, :upper95], index: model_fit.fix_ef_names) ci[:coef] = model_fit.fix_ef.values ci puts "Obtained random effects coefficient estimates:" puts model_fit.ran_ef model_fit.ran_ef_summary