#!/usr/bin/env python
# coding: utf-8

# ## RNN

# In[2]:

get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

from fastai.io import *
from fastai.conv_learner import *
from fastai.column_data import *

# ## Setup

# We're going to download the collected works of Nietzsche to use as our data for this class.

# In[2]:

PATH = 'data/nietzsche/'

# In[3]:

get_data('https://s3.amazonaws.com/text-datasets/nietzsche.txt', f'{PATH}nietzsche.txt')
text = open(f'{PATH}nietzsche.txt').read()
print('corpus length:', len(text))

# In[4]:

text[:400]

# In[5]:

chars = sorted(list(set(text)))
vocab_size = len(chars) + 1
print('total chars:', vocab_size)

# Sometimes it's useful to have a zero value in the dataset, e.g. for padding

# In[6]:

chars.insert(0, '\0')
''.join(chars[1:-6])

# Map from chars to indices and back again

# In[7]:

char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# *idx* will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)

# In[8]:

idx = [char_indices[c] for c in text]
idx[:10]

# In[9]:

''.join(indices_char[i] for i in idx[:70])

# ## Three char model

# ### Create inputs

# Create four lists, each containing every 3rd character of the text: one starting at the 0th character, then the 1st, 2nd and 3rd. The first three will be our inputs, and the fourth the character we want to predict.

# In[10]:

cs = 3
c1_dat = [idx[i]     for i in range(0, len(idx) - cs, cs)]
c2_dat = [idx[i + 1] for i in range(0, len(idx) - cs, cs)]
c3_dat = [idx[i + 2] for i in range(0, len(idx) - cs, cs)]
c4_dat = [idx[i + 3] for i in range(0, len(idx) - cs, cs)]

# Our inputs

# In[11]:

x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

# Our output

# In[12]:

y = np.stack(c4_dat)

# The first 4 inputs and outputs

# In[13]:

x1[:4], x2[:4], x3[:4]

# In[14]:

y[:4]

# In[15]:

x1.shape, y.shape

# ### Create and train model

# Pick a size for our hidden state

# In[16]:

n_hidden = 256

# The number of latent factors to create (i.e. the size of the embedding matrix)

# In[17]:

n_fac = 42

# In[18]:

class Char3Model(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)

        # The 'green arrow' from our diagram - the layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden)

        # The 'orange arrow' from our diagram - the layer operation from hidden to hidden
        self.l_hidden = nn.Linear(n_hidden, n_hidden)

        # The 'blue arrow' from our diagram - the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))

        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h + in1))
        h = F.tanh(self.l_hidden(h + in2))
        h = F.tanh(self.l_hidden(h + in3))

        return F.log_softmax(self.l_out(h), dim=-1)

# In[19]:

md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1, x2, x3], axis=1), y, bs=512)

# In[20]:

m = Char3Model(vocab_size, n_fac).cuda()

# In[21]:

it = iter(md.trn_dl)
*xs, yt = next(it)
t = m(*V(xs))

# In[22]:

opt = optim.Adam(m.parameters(), 1e-2)

# In[23]:

fit(m, md, 1, opt, F.nll_loss)

# In[24]:

set_lrs(opt, 0.001)

# In[25]:

fit(m, md, 1, opt, F.nll_loss)

# ### Test model

# In[26]:

def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

# In[27]:

get_next('y. ')

# In[28]:

get_next('ppl')

# In[29]:

get_next(' th')

# In[30]:

get_next('and')
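# The single-character test above can be extended to generate a longer string by
# repeatedly feeding the model its own prediction. This is just a sketch of the
# idea (the same trick as the `get_next_n` helper defined later in this notebook
# for the 8-character models); the name `get_next_3n` is ours, not part of the
# original lesson.

def get_next_3n(inp, n):
    # inp must be exactly 3 characters; slide the window one character at a time
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:] + c
    return res

# e.g. get_next_3n('and', 40)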
# ## Our first RNN!

# ### Create inputs

# This is the size of our unrolled RNN.

# In[31]:

cs = 8

# For each position in the text, create the list of 8 consecutive characters starting there. These overlapping sequences will be the 8 inputs to our model.

# In[32]:

c_in_dat = [[idx[i + j] for i in range(cs)] for j in range(len(idx) - cs)]

# Then create a list of the next character in each of these series. This will be the labels for our model.

# In[33]:

c_out_dat = [idx[j + cs] for j in range(len(idx) - cs)]

# In[34]:

xs = np.stack(c_in_dat, axis=0)

# In[35]:

xs.shape

# In[36]:

y = np.stack(c_out_dat)

# So each column below is one series of 8 characters from the text.

# In[37]:

xs[:cs, :cs]

# ...and this is the next character after each sequence.

# In[38]:

y[:cs]

# ### Create and train model

# In[39]:

val_idx = get_cv_idxs(len(idx) - cs - 1)

# In[40]:

md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

# In[41]:

class CharLoopModel(nn.Module):
    # This is an RNN!
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            h = F.tanh(self.l_hidden(h + inp))

        return F.log_softmax(self.l_out(h), dim=-1)

# In[42]:

m = CharLoopModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

# In[43]:

fit(m, md, 1, opt, F.nll_loss)

# In[44]:

set_lrs(opt, 0.001)

# In[45]:

fit(m, md, 1, opt, F.nll_loss)

# Rather than adding the input to the hidden state, we can concatenate the two, so the input and hidden state each get their own weights in `l_in`.

# In[46]:

class CharLoopConcatModel(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac + n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = torch.cat((h, self.e(c)), 1)
            inp = F.relu(self.l_in(inp))
            h = F.tanh(self.l_hidden(inp))

        return F.log_softmax(self.l_out(h), dim=-1)

# In[47]:

m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

# In[48]:

it = iter(md.trn_dl)
*xs, yt = next(it)
t = m(*V(xs))

# In[49]:

fit(m, md, 1, opt, F.nll_loss)

# In[50]:

set_lrs(opt, 1e-4)

# In[51]:

fit(m, md, 1, opt, F.nll_loss)

# ### Test model

# In[52]:

def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

# In[53]:

get_next('for thos')

# In[54]:

get_next('part of ')

# In[55]:

get_next('queens a')

# ## RNN with pytorch

# In[56]:

class CharRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(cs))
        outp, h = self.rnn(inp, h)

        # We only predict from the final time step's output here
        return F.log_softmax(self.l_out(outp[-1]), dim=-1)

# In[57]:

m = CharRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

# In[58]:

it = iter(md.trn_dl)
*xs, yt = next(it)

# In[59]:

t = m.e(V(torch.stack(xs)))
t.size()

# In[60]:

ht = V(torch.zeros(1, 512, n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

# In[61]:

t = m(*V(xs))
t.size()
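# Under the hood, nn.RNN computes the same recurrence we wrote by hand above,
#     h_t = tanh(W_ih @ x_t + b_ih + W_hh @ h_(t-1) + b_hh)
# at every time step, returning the stack of all hidden states plus the final one.
# The cell below is only an illustrative sketch (the `_demo` names are ours, not
# part of the original notebook) confirming the (seq_len, batch, features) layout
# that nn.RNN expects by default.

rnn_demo = nn.RNN(n_fac, n_hidden).cuda()
x_demo = V(torch.zeros(cs, 512, n_fac))     # (seq_len=8, batch=512, n_fac)
h0_demo = V(torch.zeros(1, 512, n_hidden))  # (num_layers=1, batch, n_hidden)
out_demo, hn_demo = rnn_demo(x_demo, h0_demo)
out_demo.size(), hn_demo.size()             # (8, 512, 256) and (1, 512, 256)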
# In[62]:

fit(m, md, 4, opt, F.nll_loss)

# In[63]:

set_lrs(opt, 1e-4)

# In[64]:

fit(m, md, 2, opt, F.nll_loss)

# ### Test model

# In[65]:

def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

# In[66]:

get_next('for thos')

# In[67]:

def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:] + c
    return res

# In[68]:

get_next_n('for thos', 40)

# ## Multi-output model

# ### Setup

# Let's take non-overlapping sets of characters this time

# In[69]:

c_in_dat = [[idx[i + j] for i in range(cs)] for j in range(0, len(idx) - cs - 1, cs)]

# Then create the exact same thing, offset by 1, as our labels

# In[70]:

c_out_dat = [[idx[i + j] for i in range(cs)] for j in range(1, len(idx) - cs, cs)]

# In[71]:

xs = np.stack(c_in_dat)
xs.shape

# In[72]:

ys = np.stack(c_out_dat)
ys.shape

# In[73]:

xs[:cs, :cs]

# In[74]:

ys[:cs, :cs]

# ### Create and train model

# In[75]:

val_idx = get_cv_idxs(len(xs) - cs - 1)

# In[76]:

md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

# In[77]:

class CharSeqRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(cs))
        outp, h = self.rnn(inp, h)
        # Return a prediction for every time step, not just the last one
        return F.log_softmax(self.l_out(outp), dim=-1)

# In[78]:

m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

# In[79]:

it = iter(md.trn_dl)
*xst, yt = next(it)

# The model now outputs one prediction per time step, so we need a loss function that flattens the (sequence, batch) dimensions of both predictions and targets before applying `F.nll_loss`.

# In[80]:

def nll_loss_seq(inp, targ):
    sl, bs, nh = inp.size()
    targ = targ.transpose(0, 1).contiguous().view(-1)
    return F.nll_loss(inp.view(-1, nh), targ)

# In[81]:

fit(m, md, 4, opt, nll_loss_seq)

# In[82]:

set_lrs(opt, 1e-4)

# In[83]:

fit(m, md, 1, opt, nll_loss_seq)
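# To make the shapes in `nll_loss_seq` concrete: the model output arrives as
# (sequence length, batch size, vocab size), while the targets come from the
# DataLoader as (batch size, sequence length). Transposing the targets and then
# flattening both gives (sl*bs,) and (sl*bs, vocab) views whose rows line up in
# the same (time-major) order. A minimal sketch with dummy tensors follows; the
# `_demo` names are ours and this cell is not part of the original notebook.

inp_demo = V(torch.zeros(8, 512, vocab_size))  # (sl, bs, vocab), as the model returns
targ_demo = V(torch.zeros(512, 8).long())      # (bs, sl), as the DataLoader yields
nll_loss_seq(inp_demo, targ_demo)              # runs and returns a scalar loss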
# ### Identity init!

# In[84]:

m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

# In[85]:

m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))

# In[86]:

fit(m, md, 4, opt, nll_loss_seq)

# In[87]:

set_lrs(opt, 1e-3)

# In[88]:

fit(m, md, 4, opt, nll_loss_seq)

# ## Stateful model

# ### Setup

# In[2]:

from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

PATH = 'data/nietzsche/'

TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

get_ipython().run_line_magic('ls', '{PATH}')

# In[2]:

get_ipython().run_line_magic('ls', '{PATH}trn')

# In[3]:

TEXT = data.Field(lower=True, tokenize=list)
bs = 64
bptt = 8
n_fac = 42
n_hidden = 256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

# ### RNN

# In[4]:

class CharSeqStatefulRnn(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs:
            self.init_hidden(bs)
        outp, h = self.rnn(self.e(cs), self.h)
        # Keep the hidden state between batches, but detach it from the graph
        # so gradients don't flow back through the entire history (truncated BPTT)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        self.h = V(torch.zeros(1, bs, n_hidden))

# In[5]:

m = CharSeqStatefulRnn(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

# In[6]:

fit(m, md, 4, opt, F.nll_loss)

# In[7]:

set_lrs(opt, 1e-4)

fit(m, md, 4, opt, F.nll_loss)

# ### RNN loop

# In[8]:

class CharSeqStatefulRnn2(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        # nn.RNNCell applies a single time step, so we write the loop ourselves
        self.rnn = nn.RNNCell(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs:
            self.init_hidden(bs)
        outp = []
        o = self.h
        for c in cs:
            o = self.rnn(self.e(c), o)
            outp.append(o)
        outp = self.l_out(torch.stack(outp))
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        self.h = V(torch.zeros(1, bs, n_hidden))

# In[9]:

m = CharSeqStatefulRnn2(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

# In[10]:

fit(m, md, 4, opt, F.nll_loss)

# ### GRU

# In[11]:

class CharSeqStatefulGRU(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs:
            self.init_hidden(bs)
        outp, h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        self.h = V(torch.zeros(1, bs, n_hidden))

# In[12]:

m = CharSeqStatefulGRU(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

# In[13]:

fit(m, md, 6, opt, F.nll_loss)

# In[14]:

set_lrs(opt, 1e-4)

# In[15]:

fit(m, md, 3, opt, F.nll_loss)
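# All of the stateful models above call `repackage_var` (from fastai.lm_rnn) on
# their hidden state. Conceptually it just copies the values into fresh Variables
# with no history attached, so backprop for the next mini-batch stops at the start
# of that batch instead of running through everything seen so far. A rough sketch
# of the idea (our own code, not the fastai source) might look like this:

def repackage_var_sketch(h):
    # Wrap the underlying data in new Variables so the computational graph is cut
    # here; handle the (hidden, cell) tuple that LSTMs use as well as a single tensor.
    if isinstance(h, tuple):
        return tuple(repackage_var_sketch(v) for v in h)
    return V(h.data)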
# ### Putting it all together: LSTM

# In[16]:

from fastai import sgdr

n_hidden = 512

# In[17]:

class CharSeqStatefulLSTM(nn.Module):
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size, self.nl = vocab_size, nl
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h[0].size(1) != bs:
            self.init_hidden(bs)
        outp, h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        # An LSTM carries two pieces of state: the hidden state and the cell state
        self.h = (V(torch.zeros(self.nl, bs, n_hidden)),
                  V(torch.zeros(self.nl, bs, n_hidden)))

# In[18]:

m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2).cuda()
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)

# In[19]:

os.makedirs(f'{PATH}models', exist_ok=True)

# In[20]:

fit(m, md, 2, lo.opt, F.nll_loss)

# In[21]:

on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**4 - 1, lo.opt, F.nll_loss, callbacks=cb)

# ### Test

# In[22]:

def get_next(inp):
    idxs = TEXT.numericalize(inp)
    p = m(VV(idxs.transpose(0, 1)))
    # Sample from the predicted distribution rather than always taking the argmax
    r = torch.multinomial(p[-1].exp(), 1)
    return TEXT.vocab.itos[to_np(r)[0]]

# In[23]:

get_next('for thos')

# In[24]:

def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:] + c
    return res

# In[25]:

print(get_next_n('for thos', 400))
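# A common variation on the sampler above is to add a "temperature": dividing the
# log-probabilities by a value below 1 before exponentiating sharpens the
# distribution and makes the samples more conservative, while values above 1 make
# them more varied. This is only an optional sketch, not part of the original
# lesson; the name `get_next_temp` is ours.

def get_next_temp(inp, temp=0.75):
    idxs = TEXT.numericalize(inp)
    p = m(VV(idxs.transpose(0, 1)))
    # Scale the log-probabilities before sampling; multinomial does not need
    # the weights to be normalised
    r = torch.multinomial((p[-1] / temp).exp(), 1)
    return TEXT.vocab.itos[to_np(r)[0]]

# e.g. get_next_temp('for thos', temp=0.5)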