# Language modeling with simple MLP

- Manual backprop
- Improve training loop

Sources: 
- https://github.com/karpathy/nn-zero-to-hero
- https://github.com/karpathy/makemore
- https://huggingface.co/course/chapter6/6?fw=pt

Resources:
- https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue_no_trainer.py
- https://huggingface.co/course/chapter1/1

In [None]:
import torch
import random
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
from collections import defaultdict
%matplotlib inline

In [None]:
# download the names.txt file from github
!wget -O input.txt https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [None]:
# Set seeds
# Seed PyTorch's global RNG and Python's `random` module so reruns start from the same state.
torch.manual_seed(42)
random.seed(42)

In [None]:
# read in all the words
# The download cell stores the corpus as input.txt (wget -O input.txt), so that is
# the file to read. The context manager closes the handle as soon as the text is loaded.
with open('input.txt', 'r', encoding='utf-8') as f:
    words = f.read().splitlines()
words[:8]

In [None]:
# number of words (one per line of the input file)
len(words)

### Manual backprop

In [None]:
# Character vocabulary and the lookup tables in both directions.
# Real characters are numbered from 1 in sorted order; index 0 is reserved for '.'.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
itos = {index: char for char, index in stoi.items()}
print(itos)

In [None]:
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
    """Turn a list of words into (context, next-character) training pairs.

    Every word is extended with a trailing '.' and scanned left to right. Each
    character becomes a target whose input is the `block_size` token ids that
    precede it; positions before the start of the word are filled with id 0.

    Args:
        words: iterable of strings whose characters (and '.') are keys of `stoi`.

    Returns:
        (X, Y): X holds one context of `block_size` ids per row, Y the id of the
        character following each context. Both shapes are printed as a side effect.
    """
    contexts, targets = [], []
    for word in words:
        window = [0] * block_size  # start from an all-padding context
        for char in word + '.':
            target = stoi[char]
            contexts.append(window)
            targets.append(target)
            window = window[1:] + [target]  # slide the window one character forward

    X = torch.tensor(contexts)
    Y = torch.tensor(targets)
    print(X.shape, Y.shape)
    return X, Y

import random

# Deterministic shuffle, then split the words 80% / 10% / 10% into train / dev / test.
random.seed(42)
random.shuffle(words)
n1, n2 = (int(fraction * len(words)) for fraction in (0.8, 0.9))

# The datasets are built in train, dev, test order (each call prints its shapes).
(Xtr, Ytr), (Xdev, Ydev), (Xte, Yte) = (
    build_dataset(subset) for subset in (words[:n1], words[n1:n2], words[n2:])
)

In [None]:
# A notebook cell only displays its last expression, so show the first
# training context and its target together as one tuple.
Xtr[0], Ytr[0]

In [None]:
# Init HP
vocab_size = len(itos) # number of distinct tokens (one per entry in itos)
emb_dim = 2 # length of each character embedding vector
block_size = 3 # context length; same value the dataset was built with
h_dim = 100 # number of hidden units
lr=0.1 # learning rate used for the gradient-descent updates
max_steps=100000 # upper bound on the number of training iterations

In [None]:
# utility function we will use later when comparing manual gradients to PyTorch gradients
def cmp(s, dt, t):
    """Print how a hand-computed gradient compares with autograd's.

    Args:
        s: label printed at the start of the report line.
        dt: manually computed gradient tensor.
        t: tensor whose `.grad` (filled in by autograd) is the reference.

    Prints whether the two are bit-for-bit equal, whether they agree within
    `torch.allclose` tolerance, and the largest absolute difference.
    """
    reference = t.grad
    is_exact = bool((dt == reference).all())
    is_close = torch.allclose(dt, reference)
    worst = (dt - reference).abs().max().item()
    print(f'{s:15s} | exact: {str(is_exact):5s} | approximate: {str(is_close):5s} | maxdiff: {worst}')

In [None]:
# Initialise the model parameters, then draw one random mini-batch.
# Everything is sampled from the same seeded generator, so the order of the calls matters.
g = torch.Generator().manual_seed(42) # for reproducibility
C = torch.randn((vocab_size, emb_dim), generator=g) # embedding table: one row per token
W1 = torch.randn((emb_dim * block_size, h_dim), generator=g) # hidden layer weights
b1 = torch.randn(h_dim, generator=g) # hidden layer bias
W2 = torch.randn((h_dim, vocab_size), generator=g) # output layer weights
b2 = torch.randn(vocab_size , generator=g) # output layer bias
batch_size= 32
ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g) # random row indices into the training set
Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
parameters = [C, W1, b1, W2, b2]
for p in parameters:
  p.requires_grad = True # let autograd record gradients for every parameter

In [None]:
# Chunked forward pass: every intermediate result gets its own name so that its
# gradient can be retained and inspected after the backward pass.
emb = C[Xb] # look up one embedding vector per context character
embcat = emb.view(emb.shape[0], -1) # flatten each example's embeddings; width follows emb_dim * block_size
hpreact = embcat @ W1 + b1 # hidden layer pre-activation
h = torch.tanh(hpreact) # hidden layer
logits = h @ W2 + b2 # output layer
loss = F.cross_entropy(logits, Yb)

# PyTorch backward pass
for p in parameters:
    p.grad = None
# Intermediates are non-leaf tensors: autograd only keeps their .grad when asked to.
for t in [logits, h, hpreact, embcat, emb]:
    t.retain_grad()
loss.backward()
loss

In [None]:
# Recap chain rule: h(x) = f(g(x)) ==> h'(x) = f'(g(x))g'(x) | df/dx = df/dg * dg/dx
# Manual backward pass, walking the forward graph from the loss back to the embedding table.
# Each d<name> is the gradient of the batch-mean loss w.r.t. <name> and has the shape of <name>.
n = Yb.shape[0] # number of examples in the batch

with torch.no_grad(): # plain tensor arithmetic only; autograd must not track these ops
    # Loss
    # Cross-entropy on logits: softmax(logits) minus the one-hot target,
    # divided by n because F.cross_entropy averages over the batch.
    dlogits = F.softmax(logits, dim=1)
    dlogits[torch.arange(n), Yb] -= 1
    dlogits /= n

    # Layer 2 (logits = h @ W2 + b2)
    dh = dlogits @ W2.T
    dW2 = h.T @ dlogits
    db2 = dlogits.sum(0) # the bias is broadcast over the batch, so its gradient sums over it

    # Tanh (d/dx tanh(x) = 1 - tanh(x)^2, and h already holds tanh(hpreact))
    dhpreact = (1.0 - h**2) * dh

    # Layer 1 (hpreact = embcat @ W1 + b1)
    dembcat = dhpreact @ W1.T
    dW1 = embcat.T @ dhpreact
    db1 = dhpreact.sum(0)

    # Concatenation (embcat is only a reshaped view of emb, so undo the reshape)
    demb = dembcat.view(emb.shape)

    # Embedding (emb = C[Xb]): every use of a row of C adds its gradient to that row
    dC = torch.zeros_like(C)
    dC.index_add_(0, Xb.reshape(-1), demb.reshape(-1, demb.shape[-1]))

cmp('logits', dlogits, logits)
cmp('h', dh, h)
cmp('W2', dW2, W2)
cmp('b2', db2, b2)
cmp('hpreact', dhpreact, hpreact)
cmp('embcat', dembcat, embcat)
cmp('W1', dW1, W1)
cmp('b1', db1, b1)
cmp('emb', demb, emb)
cmp('C', dC, C)

In [None]:
# Model
g = torch.Generator().manual_seed(42) # for reproducibility
C = torch.randn((vocab_size, emb_dim), generator=g)
W1 = torch.randn((emb_dim * block_size, h_dim), generator=g)
b1 = torch.randn(h_dim, generator=g)
W2 = torch.randn((h_dim, vocab_size), generator=g)
b2 = torch.randn(vocab_size , generator=g)

parameters = [C, W1, b1, W2, b2]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
    p.requires_grad = True # kept on so later cells can still call loss.backward() on these tensors

# same optimization as last time
batch_size = 32
losses = []

# The gradients below are computed by hand, so autograd is switched off for the
# whole loop: no graph is built and loss.backward() is no longer needed.
with torch.no_grad():
    for i in range(max_steps):
        # Batching
        ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
        Xb, Yb = Xtr[ix], Ytr[ix]

        # Forward pass
        # Embedding
        emb = C[Xb] # embed the characters into vectors
        embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
        # Linear layer 1
        hpreact = embcat @ W1 + b1 # hidden layer pre-activation
        # Non-linearity
        h = torch.tanh(hpreact) # hidden layer
        # Linear layer 2
        logits = h @ W2 + b2 # output layer
        # Loss
        loss = F.cross_entropy(logits, Yb)

        # manual backprop
        # -----------------
        # Loss: softmax minus one-hot target, averaged over the batch
        dlogits = F.softmax(logits, dim=1)
        dlogits[torch.arange(batch_size), Yb] -= 1
        dlogits /= batch_size
        # Linear layer 2
        dh = dlogits @ W2.T
        dW2 = h.T @ dlogits
        db2 = dlogits.sum(0)
        # Tanh
        dhpreact = (1.0 - h**2) * dh
        # Linear layer 1
        dembcat = dhpreact @ W1.T
        dW1 = embcat.T @ dhpreact
        db1 = dhpreact.sum(0)
        # Concatenation + embedding lookup: accumulate into the rows of C that were used
        demb = dembcat.view(emb.shape)
        dC = torch.zeros_like(C)
        dC.index_add_(0, Xb.reshape(-1), demb.reshape(-1, demb.shape[-1]))
        # same order as `parameters`
        grads = [dC, dW1, db1, dW2, db2]
        # -----------------

        # update
        for p, grad in zip(parameters, grads):
            p.data += -lr * grad

        # track stats (plain floats only, one entry per step)
        if i % 10000 == 0: # print every once in a while
            print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
        losses.append(loss.item())

        # stop early; the run is cut off shortly after 10000 steps
        if i > 10000:
            break
        

### Improve the training loop

In [None]:
# Training loop from last session
# Plain mini-batch SGD on the raw parameter tensors; each "==>" note names the
# PyTorch building block that takes over that step further down.
lr=0.1
max_steps = 100000
losses = []
batch_size = 32

for i in range(max_steps):
    # Batching ==> Replace with batches from pytorch dataloader
    ix = torch.randint(0, Xtr.shape[0], (batch_size,)) 
    
    # Forward pass ==> Replace with call to our pytorch model
    emb = C[Xtr[ix]] # (batch_size, block_size, emb_dim)
    h = torch.tanh(emb.view(-1, block_size*emb_dim) @ W1 + b1) # (batch_size, h_dim)
    logits = h @ W2 + b2 # (batch_size, vocab_size)
    loss = F.cross_entropy(logits, Ytr[ix])
    
    # Reset gradients ==> Replace with the functions of our pytorch model
    for p in parameters:
        p.grad = None
        
    # Backward pass
    loss.backward()
    
    # Stochastic gradient descent ==> Replace with an optimizer from pytorch
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    losses.append(loss.item()) # store a plain float per step

#### Model

In [None]:
# Model from last session
# Fresh parameter tensors drawn from a seeded generator. Note that h_dim is 200 here
# and that this cell does not enable requires_grad on the new tensors.
vocab_size = len(itos)
emb_dim = 2
block_size = 3
h_dim = 200
g = torch.Generator().manual_seed(42) # for reproducibility
C = torch.randn((vocab_size, emb_dim), generator=g) # embedding table
W1 = torch.randn((emb_dim * block_size, h_dim), generator=g) # hidden layer weights
b1 = torch.randn(h_dim, generator=g) # hidden layer bias
W2 = torch.randn((h_dim, vocab_size), generator=g) # output layer weights
b2 = torch.randn(vocab_size , generator=g) # output layer bias
parameters = [C, W1, b1, W2, b2]

In [None]:
batch_size=32
# Forward pass from last session
ix = torch.randint(0, Xtr.shape[0], (batch_size,)) # random row indices into the training set
# Forward pass
emb = C[Xtr[ix]] # (batch_size, block_size, emb_dim)
h = torch.tanh(emb.view(-1, block_size*emb_dim) @ W1 + b1) # (batch_size, h_dim)
logits = h @ W2 + b2 # (batch_size, vocab_size)
loss = F.cross_entropy(logits, Ytr[ix])

In [None]:
class MLP(torch.nn.Module):
    """Character-level MLP language model: embedding -> tanh hidden layer -> logits.

    Args:
        embedding_dim: length of each token embedding vector.
        block_size: number of context tokens per example.
        hidden_dim: width of the hidden layer.
        vocab_size: number of distinct tokens (embedding rows and output logits).
        *args, **kwargs: forwarded unchanged to `torch.nn.Module.__init__`.
    """

    def __init__(self, embedding_dim=2, block_size=3, hidden_dim=100, vocab_size=27, *args, **kwargs) -> None:
        # Define components and hyperparameters of your model
        super().__init__(*args, **kwargs)
        self.block_size = block_size
        self.cat_dim = embedding_dim * block_size  # width of the concatenated context embeddings
        self.C = torch.nn.Embedding(vocab_size, embedding_dim) # Why do we use torch.nn.Embedding?
        self.dense = torch.nn.Linear(self.cat_dim, hidden_dim)
        self.out = torch.nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, y=None):
        """Run a forward pass.

        Args:
            x: integer token ids; after the embedding lookup they are reshaped
                to rows of `block_size * embedding_dim` values.
            y: optional target token ids, one per row of the logits.

        Returns:
            `(logits, loss)`. `loss` is the cross-entropy against `y`, or `None`
            when no targets are given, so the result can always be unpacked
            into two values.
        """
        emb = self.C(x)
        h = torch.tanh(self.dense(emb.view(-1, self.cat_dim)))
        logits = self.out(h)

        # A conditional expression binds tighter than the tuple comma, so the
        # loss is resolved first and the pair is built afterwards.
        loss = F.cross_entropy(logits, y) if y is not None else None
        return logits, loss

In [None]:
# Built with the constructor defaults: embedding_dim=2, block_size=3, hidden_dim=100, vocab_size=27
model = MLP()

In [None]:
# Initialization of weights in Pytorch?
# Loss of the untrained model on the batch (Xb, Yb); index 1 selects the second returned value.
model(Xb,Yb)[1]

In [None]:
# Expected loss without training
# i.e. -log(1/27): the cross-entropy of a uniform guess over 27 classes
-torch.tensor(1/27).log()

In [None]:
# Evaluation function
@torch.inference_mode() # @torch.no_grad()
def evaluate(model, loader, device):
    """Return the mean per-batch loss of `model` over `loader`.

    Args:
        model: module whose call returns a `(logits, loss)` pair.
        loader: iterable of batches; each batch is a sequence of tensors that
            is moved to `device` and unpacked as the model's arguments.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        The unweighted average of the batch losses as a Python float
        (a smaller final batch counts as much as a full one), or NaN when
        `loader` yields no batches.

    The model is switched to eval mode for the duration of the call and put
    back into whichever mode it was in before, even if a batch raises.
    """
    was_training = model.training
    model.eval()
    batch_losses = []
    try:
        for batch in loader:
            batch = [t.to(device) for t in batch]
            _, loss = model(*batch)
            # Keep a plain float so no device tensors accumulate.
            batch_losses.append(loss.item())
    finally:
        model.train(was_training) # restore the caller's mode instead of forcing train mode

    if not batch_losses:
        return float('nan')
    return torch.tensor(batch_losses).mean().item()

#### Dataloader | Dataset

In [None]:
# Implement dataset
class NameDataset(torch.utils.data.Dataset):
    """Map-style dataset over context tensors and, optionally, their targets.

    Args:
        X: tensor with one example per row (indexed along dimension 0).
        Y: optional tensor of targets aligned with `X`; leave it out for
            unlabeled data.

    Raises:
        ValueError: if `Y` is given but has a different number of rows than `X`.
    """

    def __init__(self, X, Y=None) -> None:
        # Setup the data
        super().__init__()
        # This could include loading and preprocessing of the data
        if Y is not None and len(Y) != len(X):
            raise ValueError(f"X and Y must have the same length, got {len(X)} and {len(Y)}")
        self.X = X
        # Always set the attribute so __getitem__ can test it safely.
        self.Y = Y

    def __len__(self):
        # Get the length of the dataset
        return self.X.shape[0]

    def __getitem__(self, idx):
        # Get an instance of the dataset given an index.
        # Labeled data yields (x, y); unlabeled data yields the 1-tuple (x,), so a
        # batch can be unpacked with `model(*batch)` in both cases.
        if self.Y is None:
            return (self.X[idx],)
        return self.X[idx], self.Y[idx]


#### Putting everything together

In [None]:
lr=0.01
max_epochs = 10
batch_size=32
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Dealing with GPU and CPU training

losses = [] # one float per optimisation step

train_dataset = NameDataset(Xtr, Ytr) # Creating our own train dataset
val_dataset = NameDataset(Xdev, Ydev) # Creating our own validation dataset

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) # Wrap the train dataset into a dataloader
val_dataloder = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False) # Wrap the validation dataset into a dataloader

# Move the model to its device first, then hand its parameters to the optimizer.
model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=lr) # Define an optimizer (uses the lr set above)

print(f"Number of training instances: {len(train_dataset)}")
print(f"Number of training batches per epoch: {len(train_dataloader)}")

max_steps =  max_epochs * len(train_dataloader) # Compute the maximum number of steps for logging purposes

step = 0
for epoch in range(max_epochs):
    # Set model in training mode
    model.train()
    # The dataloader replaces the manual `torch.randint` batch sampling.
    for batch in train_dataloader:
        # Move to device
        batch = [t.to(device) for t in batch] # Why do we iterate through the elements of batch?
        # Forward pass (replaces the hand-written embedding / tanh / logits / cross-entropy lines)
        logits, loss = model(*batch) # What does *batch do?
        # Reset gradients (replaces setting p.grad = None for every parameter by hand)
        model.zero_grad(set_to_none=True)
        # Backward pass
        loss.backward()

        # Stochastic gradient descent (replaces the manual `p.data += -lr * p.grad` update)
        optimizer.step()

        # track stats: store a plain float so each step's autograd graph can be freed
        losses.append(loss.item())

        step += 1

    # Logging (customize to your needs); this is the loss of the epoch's last batch
    print(f'Loss: {step:7d}/{max_steps:7d}: {loss.item():.4f}')

    # Evaluate after each epoch (customize to your needs)
    eval_loss = evaluate(model, val_dataloder, device)
    print(f'Val loss: {step:7d}/{max_steps:7d}: {eval_loss:.4f}')



In [None]:
# Reshape the per-step losses to one row per epoch (len(train_dataloader) steps each)
# and plot the mean training loss of every epoch.
plt.plot(torch.tensor(losses).view(-1, len(train_dataloader)).mean(1))

### Coding Exercises:
- Train a model on the CBOW architecture using our dataset
- Experiment with different optimizers, learning rates

### Pen & Paper Exercises:
- Do the math and show that our gradient for the cross entropy loss is correct
- Do the math and show that our gradient for the linear layer is correct
- Given the following model: $f(x) = w_3x^3+w_2x^2+w_1x+w_0$; optimize the model's parameters (i.e., $w_0,w_1,w_2,w_3$) using stochastic gradient descent, given the following training examples (x,y): {(1,3),(-1,-5),(0,-3)} and the squared error loss $(y - f(x))^2$. Update the model's parameters after each example. Use a learning rate $\eta = 0.1$ and initialize all parameters to $1$ (i.e., $w_0=w_1=w_2=w_3=1$).\
Solution:\
(1,3): $w_0=w_1=w_2=w_3=0.8$\
(-1,-5): $w_0=-0.2$, $w_1=1.8$, $w_2=-0.2$, $w_3=1.8$\
(0,-3): $w_0=-0.76$, $w_1=1.8$, $w_2=-0.2$, $w_3=1.8$
