Commit e97f13c

Refactor train and generate
1 parent 7eaabaf commit e97f13c

File tree

6 files changed: +227 −108 lines


.gitignore

+2
@@ -1,3 +1,5 @@
+checkpoints/
+.DS_Store
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

README.md

+32 −4

@@ -140,9 +140,37 @@ Next, install the dependencies:
 ```bash
 pip install -r requirements.txt
 ```
+
+### Training the Model
 
-You can then run the main script:
+The `train.py` script trains the model. It accepts the following command line arguments:
 
-```bash
-python main.py
-```
+- `--iters`: Total iterations to train. Default is 5000.
+- `--lr`: Learning rate. Default is 3e-4.
+- `--device`: Device to use for training. Default is "cuda" if CUDA is available, otherwise "mps".
+- `--checkpoint_dir`: Directory to save the model checkpoints. Default is "checkpoints".
+
+Example usage:
+
+```shell
+python train.py --iters 10000 --lr 1e-4 --device cuda --checkpoint_dir my_checkpoints
+```
+
+This will train the model for 10000 iterations with a learning rate of 1e-4, using a CUDA device for training. The model checkpoints will be saved in the `my_checkpoints` directory.
+
+
+
+### Generating New Text
+
+The `generate.py` script generates new text from a trained model. It accepts the following command line arguments:
+
+- `--checkpoint_path`: Path to the model checkpoint. This argument is required.
+- `--num_tokens`: Number of tokens to generate. Default is 100.
+
+Example usage:
+
+```shell
+python generate.py --checkpoint_path my_checkpoints/model_state_10000.pt --num_tokens 500
+```
+
+This will generate 500 new tokens from the model checkpoint at `my_checkpoints/model_state_10000.pt`.

decoder_transformer.py

+14 −15

@@ -1,28 +1,27 @@
-from typing import Optional, Tuple
-
+from __future__ import annotations
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
 
-class EncoderTransformer(nn.Module):
+class DecoderTransformer(nn.Module):
     def __init__(
         self,
         num_blocks: int,
         num_heads: int,
         embed_size: int,
-        block_size: int,
+        context_size: int,
         vocab_size: int,
     ):
         super().__init__()
-        self.block_size = block_size
+        self.context_size = context_size
         self.vocab_size = vocab_size
         self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
-        self.position_embedding_table = nn.Embedding(block_size, embed_size)
+        self.position_embedding_table = nn.Embedding(context_size, embed_size)
         head_size = embed_size // num_heads
         self.blocks = nn.Sequential(
             *[
-                Block(num_heads, head_size, embed_size, block_size)
+                Block(num_heads, head_size, embed_size, context_size)
                 for _ in range(num_blocks)
             ]
             + [nn.LayerNorm(embed_size)]
@@ -49,23 +48,23 @@ def forward(
 
         return logits, loss
 
-    def generate(self, idx: torch.Tensor, max_tokens: int) -> torch.Tensor:
+    def generate(self, context: torch.Tensor, num_tokens: int) -> torch.Tensor:
         # generate tokens
         with torch.no_grad():
-            for i in range(max_tokens):
-                cond_idx = idx[:, -self.block_size :]
-                logits, _ = self.forward(cond_idx)
+            for _ in range(num_tokens):
+                cond_context = context[:, -self.context_size :]
+                logits, _ = self.forward(cond_context)
                 logits = logits[:, -1, :]
                 probs = F.softmax(logits, dim=-1)
                 next_token = torch.multinomial(probs, 1)
-                idx = torch.cat((idx, next_token), dim=1)
-        return idx
+                context = torch.cat((context, next_token), dim=1)
+        return context
 
 
 class MultiHeadAttention(nn.Module):
     """
     A multi-head attention layer.
-    Takees in a number of heads retruen a concatenated output of all heads.
+    Takes in a number of heads and returns a concatenated output of all heads.
     """
 
     def __init__(
@@ -148,7 +147,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 class Block(nn.Module):
     """
-    A single block of the Transformer.
+    A single transformer block.
     """
 
     def __init__(
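To make the renamed interface concrete, here is a minimal usage sketch of the refactored `DecoderTransformer` (not part of the commit), using the same hyperparameters that `train.py` and `generate.py` hard-code; the vocabulary size and the random context below are placeholders for illustration only.

```python
import torch

from decoder_transformer import DecoderTransformer

# Hyperparameters mirror the values hard-coded in train.py / generate.py.
model = DecoderTransformer(
    num_blocks=6,
    num_heads=6,
    embed_size=384,
    context_size=256,
    vocab_size=100,  # placeholder; the scripts use len(set(text))
)

# One sequence of 8 placeholder token ids, shape (batch, time) = (1, 8).
context = torch.randint(0, 100, (1, 8), dtype=torch.long)

# generate() crops the context to the last context_size tokens, samples
# num_tokens new tokens one at a time, and returns the full sequence,
# so the output shape here is (1, 8 + 20).
tokens = model.generate(context, num_tokens=20)
print(tokens.shape)
```

With untrained weights the sampled tokens are of course meaningless; the point is only the shape contract of the renamed `generate(context, num_tokens)` method.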

generate.py

+64
@@ -0,0 +1,64 @@
+import argparse
+
+import torch
+
+from decoder_transformer import DecoderTransformer
+
+
+def main():
+    # Define command line arguments
+    parser = argparse.ArgumentParser(
+        description="Generate text from a trained transformer model."
+    )
+    parser.add_argument(
+        "--checkpoint_path",
+        type=str,
+        required=True,
+        help="Path to the model checkpoint.",
+    )
+    parser.add_argument(
+        "--num_tokens", type=int, default=100, help="Number of tokens to generate."
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda" if torch.cuda.is_available() else "mps",
+        help="Device to use for training.",
+    )
+    args = parser.parse_args()
+
+    with open("verne.txt", "r") as f:
+        text = f.read()
+
+    device = torch.device(args.device)
+    vocab_size = len(set(text))
+    embed_size = 384
+    context_size = 256
+    num_heads = 6
+    num_blocks = 6
+
+    state_dict = torch.load(args.checkpoint_path)
+    encode = lambda x: [state_dict["encoder_dictionary"][c] for c in x]
+    decode = lambda x: "".join([state_dict["decoder_dictionary"][i] for i in x])
+
+    # Load the model from the checkpoint
+    model = DecoderTransformer(
+        num_blocks=num_blocks,
+        num_heads=num_heads,
+        embed_size=embed_size,
+        context_size=context_size,
+        vocab_size=vocab_size,
+    ).to(device)
+    model.load_state_dict(state_dict, strict=False)
+
+    encoded_context = (
+        torch.tensor(encode("The "), dtype=torch.long).unsqueeze(0).to(device)
+    )
+
+    # Generate text
+    generated_text = model.generate(encoded_context, args.num_tokens)
+    print(decode(generated_text.tolist()[0]))
+
+
+if __name__ == "__main__":
+    main()

main.py

-89
This file was deleted.

train.py

+115
@@ -0,0 +1,115 @@
+import argparse
+import os
+from datetime import datetime
+
+import torch
+from tqdm import tqdm
+
+from decoder_transformer import DecoderTransformer
+
+with open("verne.txt", "r") as f:
+    text = f.read()
+
+vocab_size = len(set(text))
+batch_size = 64
+embed_size = 384
+context_size = 256
+num_heads = 6
+num_blocks = 6
+
+# Define command line arguments
+parser = argparse.ArgumentParser(description="Train a transformer model.")
+parser.add_argument(
+    "--iters", type=int, default=5000, help="Total iterations to train."
+)
+parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate.")
+parser.add_argument(
+    "--device",
+    type=str,
+    default="cuda" if torch.cuda.is_available() else "mps",
+    help="Device to use for training.",
+)
+parser.add_argument(
+    "--checkpoint_dir",
+    type=str,
+    default="checkpoints",
+    help="Directory to save the model checkpoints.",
+)
+args = parser.parse_args()
+
+device = torch.device(args.device)
+learning_rate = args.lr
+total_iters = args.iters
+eval_iters = total_iters // 10
+
+# construct a character level tokenizer
+ctoi = {c: i for i, c in enumerate(set(text))}
+itoc = {i: c for i, c in enumerate(set(text))}
+encode = lambda x: [ctoi[c] for c in x]
+decode = lambda x: "".join([itoc[i] for i in x])
+
+data = torch.tensor(encode(text), dtype=torch.long)
+n = int(len(data) * 0.9)
+train_data = data[:n]
+val_data = data[n:]
+
+
+def get_batch(split):
+    data = train_data if split == "train" else val_data
+    ix = torch.randint(0, len(data) - context_size, (batch_size,))
+    x = torch.stack([data[i : i + context_size] for i in ix])
+    y = torch.stack([data[i + 1 : i + context_size + 1] for i in ix])
+    return x.to(device), y.to(device)
+
+
+@torch.no_grad()
+def eval_loss(model):
+    model.eval()
+    out = {}
+    for split in ["train", "val"]:
+        losses = torch.zeros(eval_iters)
+        for i in range(eval_iters):
+            x, y = get_batch(split)
+            _, loss = model(x, y)
+            losses[i] = loss.item()
+        out[split] = losses.mean().item()
+    model.train()
+    return out
+
+
+start_time = datetime.now().strftime("%Y%m%d_%H%M")
+
+model = DecoderTransformer(
+    num_blocks=num_blocks,
+    num_heads=num_heads,
+    embed_size=embed_size,
+    context_size=context_size,
+    vocab_size=vocab_size,
+).to(device)
+optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
+progress_bar = tqdm(range(total_iters))
+checkpoint_dir = args.checkpoint_dir
+os.makedirs(checkpoint_dir, exist_ok=True)
+
+# train the model
+for i in progress_bar:
+    model.train()
+    x, y = get_batch("train")
+    logits, loss = model(x, y)
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+
+    if i % eval_iters == 0 and i > 0:
+        # Save the model state
+        state_dict = model.state_dict()
+        state_dict["encoder_dictionary"] = ctoi
+        state_dict["decoder_dictionary"] = itoc
+        torch.save(
+            state_dict,
+            os.path.join(checkpoint_dir, f"{start_time}_model_state_{i}.pt"),
+        )
+
+        # Log the losses
+        losses = eval_loss(model)
+        progress_bar.set_postfix(losses)
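A note on the checkpoint format produced above: the tokenizer dictionaries are stored inside the model's `state_dict` under the extra keys `encoder_dictionary` and `decoder_dictionary`, which is why `generate.py` loads the file with `strict=False`. Below is a minimal sketch (not part of the commit) of reading such a checkpoint and separating the tokenizer from the weights; the file name is a hypothetical example following the `{start_time}_model_state_{i}.pt` pattern used in `train.py`.

```python
import torch

from decoder_transformer import DecoderTransformer

# Hypothetical checkpoint path following the f"{start_time}_model_state_{i}.pt" naming.
checkpoint = torch.load(
    "checkpoints/20240101_1200_model_state_1000.pt", map_location="cpu"
)

# Pop the tokenizer dictionaries out of the saved dict...
ctoi = checkpoint.pop("encoder_dictionary")
itoc = checkpoint.pop("decoder_dictionary")

# ...so that what remains is a plain model state_dict and strict loading works.
model = DecoderTransformer(
    num_blocks=6,
    num_heads=6,
    embed_size=384,
    context_size=256,
    vocab_size=len(ctoi),
)
model.load_state_dict(checkpoint)

encode = lambda s: [ctoi[c] for c in s]
decode = lambda ids: "".join(itoc[i] for i in ids)
```

Popping the extra keys first avoids relying on `strict=False`, which would also silence genuinely missing or unexpected weight keys.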
