without_header = '../examples/data/blogData_train.csv'
with_header = '../examples/data/blogData_train_with_header.csv'
colnames = (1..281).to_a.map { |x| "v#{x}" }
header = colnames.join(',')
File.open(with_header, 'w') do |fo|
  fo.puts header
  File.foreach(without_header) do |li|
    fo.puts li
  end
end

require 'daru'
df = Daru::DataFrame.from_csv '../examples/data/blogData_train_with_header.csv'
puts "Number of rows: #{df.nrows}"
puts "Number of columns: #{df.ncols}"

# select a subset of columns of the data frame
keep = ['v16', 'v41', 'v54', 'v62', 'v270', 'v271', 'v272', 
        'v273', 'v274', 'v275', 'v276', 'v277', 'v280']
blog_data = df[*keep]
df = nil

# assign meaningful names for the selected columns
meaningful_names = [:host_comments_avg, :host_trackbacks_avg, 
                    :comments, :length, :mo, :tu, :we, :th, 
                    :fr, :sa, :su, :parents, :parents_comments]
blog_data.vectors = Daru::Index.new(meaningful_names)

# the resulting data set
blog_data.head

nonzero_ind = blog_data[:length].each_index.select do |i| 
  blog_data[:length][i] > 0 && blog_data[:comments][i] > 0
end
blog_data = blog_data.row[*nonzero_ind]
puts "Remaining number of rows: #{blog_data.nrows}"

days = Array.new(blog_data.nrows) { :unknown }
[:mo, :tu, :we, :th, :fr, :sa, :su].each do |d|
  ind = blog_data[d].to_a.each_index.select { |i| blog_data[d][i]==1 }
  ind.each { |i| days[i] = d.to_s }
  blog_data.delete_vector(d)
end
blog_data[:day] = days
blog_data.head 3

# create a binary indicator vector specifying if a blog post has at least 
# one parent post which has comments
hpwc = (blog_data[:parents] * blog_data[:parents_comments]).to_a
blog_data[:has_parent_with_comments] = hpwc.map { |t| t == 0 ? 'no' : 'yes'} 
blog_data.delete_vector(:parents)
blog_data.delete_vector(:parents_comments)
blog_data.head 3

log_comments = blog_data[:comments].to_a.map { |c| Math::log(c) }
log_host_comments_avg = blog_data[:host_comments_avg].to_a.map { |c| Math::log(c) }
blog_data[:log_comments] = log_comments
blog_data[:log_host_comments_avg] = log_host_comments_avg
blog_data.head 3

require 'mixed_models'
model_fit = LMM.from_formula(formula: "log_comments ~ log_host_comments_avg + host_trackbacks_avg + length + has_parent_with_comments + (1 | day)", data: blog_data)
model_fit.fix_ef

model_fit.fix_ef_summary

require 'gnuplotrb'
include GnuplotRB

x, y = model_fit.fitted, model_fit.residuals
fitted_vs_residuals = Plot.new([[x,y], with: 'points', pointtype: 6, notitle: true],
                               xlabel: 'Fitted', ylabel: 'Residuals')
fitted_vs_residuals.term('png')

bin_width = (y.max - y.min)/10.0
bins = (y.min..y.max).step(bin_width).to_a
rel_freq = Array.new(bins.length-1){0.0}
y.each do |r|
  0.upto(bins.length-2) do |i|
    if r >= bins[i] && r < bins[i+1] then
      rel_freq[i] += 1.0/y.length
    end
  end
end
bins_center = bins[0...-1].map { |b| b + bin_width/2.0 }
  
residuals_hist = Plot.new([[bins_center, rel_freq], with: 'boxes', notitle: true],
                           style: 'fill solid 0.5')
residuals_hist.term('png')

require 'distribution'

observed = model_fit.residuals.sort
n = observed.length
theoretical = (1..n).to_a.map { |t| Distribution::Normal.p_value(t.to_f/n.to_f) * model_fit.sigma}
qq_plot = Plot.new([[theoretical, observed], with: 'points', pointtype: 6, notitle: true],
                   ['x', with: 'lines', notitle: true],
                   xlabel: 'Normal theoretical quantiles', ylabel: 'Observed quantiles',
                   title: 'Q-Q plot of the residuals')
qq_plot.term('png')

puts "Obtained fixed effects coefficient estimates:"
puts model_fit.fix_ef

model_fit.fix_ef_p(method: :wald)

conf_int = model_fit.fix_ef_conf_int(level: 0.95, method: :wald)
ci = Daru::DataFrame.rows(conf_int.values, order: [:lower95, :upper95], index: model_fit.fix_ef_names)
ci[:coef] = model_fit.fix_ef.values
ci

puts "Obtained random effects coefficient estimates:"
puts model_fit.ran_ef

model_fit.ran_ef_summary