# Example: recover linear-model coefficients with stochastic gradient descent
# (SGDOptim.jl), then report the relative squared error of the estimate.
using SGDOptim

w = [3.0, -4.0, 5.0]            # the underlying model coefficients
n = 10000                       # number of samples
X = randn(3, n)                 # generate 10000 sample features (3 x n)
sig = 0.1                       # noise level
y = vec(w'X) + sig * randn(n)   # generate the responses, adding some noise

rmodel = riskmodel(
    LinearPred(3),   # use linear prediction x -> w'x, 3 is the input dimension
    SqrLoss())       # use squared loss: loss(u, y) = (u - y)^2/2

w_e = sgd(rmodel,
    zeros(3),                       # the initial guess
    minibatch_seq(X, y, 10),        # supply the data in mini-batches, each with 10 samples
    reg = SqrL2Reg(1.0e-4),         # add squared L2 regularization with coefficient 1.0e-4
    lrate = t -> 1.0 / (100.0 + t), # set the rule of learning rate
    cbinterval = 100,               # invoke the callback every 100 iterations
    callback = simple_trace)        # print the optimization trace in callback

# Relative squared error of the estimate. `sum(abs2, x)` replaces `sumabs2(x)`,
# which was deprecated in Julia 0.5 and removed in 0.6; the replacement works
# on all Julia versions, so this stays compatible with the original's era.
sum(abs2, w_e - w) / sum(abs2, w)