Add first bigram model

joaoflf · joaoflf · commit da2566c2086f · 2023-03-19T10:45:16.000Z
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,22 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Cython debug symbols
+cython_debug/
diff --git a/prototype.ipynb b/prototype.ipynb
@@ -0,0 +1,211 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "from torch.nn import functional as F"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('verne.txt', 'r') as f:\n",
+    "    text = f.read()\n",
+    "\n",
+    "vocab_size = len(set(text))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# construct a character level tokenizer\n",
+    "ctoi = {c:i for i,c in enumerate(set(text))}\n",
+    "itoc = {i:c for i,c in enumerate(set(text))}\n",
+    "encode = lambda x: [ctoi[c] for c in x]\n",
+    "decode = lambda x: ''.join([itoc[i] for i in x])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = torch.tensor(encode(text), dtype=torch.long)\n",
+    "\n",
+    "n = int(len(data) *.9)\n",
+    "train_data = data[:n]\n",
+    "val_data = data[n:]\n",
+    "\n",
+    "batch_size = 32\n",
+    "block_size = 8\n",
+    "device = torch.device('mps')\n",
+    "\n",
+    "def get_batch(split):\n",
+    "    data = train_data if split == 'train' else val_data\n",
+    "    ix = torch.randint(0, len(data) - block_size, (batch_size,))\n",
+    "    x = torch.stack([data[i:i+block_size] for i in ix])\n",
+    "    y = torch.stack([data[i+1:i+block_size+1] for i in ix])\n",
+    "    return x.to(device), y.to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class BigramLanguageModel(nn.Module):\n",
+    "    def __init__(self, vocab_size: int):\n",
+    "        super().__init__()\n",
+    "        #construct a lookup table where each row corresponds to each token\n",
+    "        #and contains the logits for the next tokcn\n",
+    "        self.embedding_table= nn.Embedding(vocab_size, vocab_size)\n",
+    "\n",
+    "    def forward(self, idx:torch.Tensor, target:torch.Tensor | None = None) -> tuple[torch.Tensor, torch.Tensor | None]:\n",
+    "        #look up the logits for the next token\n",
+    "        logits = self.embedding_table(idx)\n",
+    "\n",
+    "        if target is None:\n",
+    "            loss = None\n",
+    "        else:\n",
+    "            #compute the loss\n",
+    "            B, T, C = logits.shape\n",
+    "            logits = logits.view(B*T, C)\n",
+    "            loss = F.cross_entropy(logits, target.view(-1))\n",
+    "        return logits, loss\n",
+    "\n",
+    "    def generate(self, idx: torch.Tensor, max_tokens:int) -> torch.Tensor:\n",
+    "        #generate tokens\n",
+    "        with torch.no_grad():\n",
+    "            for _ in range(max_tokens):\n",
+    "                logits, loss = self.forward(idx)\n",
+    "                logits = logits[:, -1, :]\n",
+    "                probs = F.softmax(logits, dim=-1)\n",
+    "                next_token = torch.multinomial(probs, 1)\n",
+    "                idx = torch.cat((idx, next_token), dim=1)\n",
+    "            return idx"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([32, 8])\n",
+      "tensor(5.1389, device='mps:0', grad_fn=<NllLossBackward0>)\n",
+      "0£h Œi((“WI_+z:YyNXn=-1”_Tr5i﻿£:oN“3$\n",
+      "°m/zfŒ\"EfYM5>:3&OgPŒ,‘J-6i1/_V_″vfS7I@FnCé=—A\n",
+      "N2:i57ï/)X1!nEb,>\n"
+     ]
+    }
+   ],
+   "source": [
+    "torch.manual_seed(1337)\n",
+    "bigram = BigramLanguageModel(vocab_size).to(device) \n",
+    "x, y = get_batch('train')\n",
+    "\n",
+    "print(x.shape)\n",
+    "\n",
+    "logits, loss = bigram(x,y)\n",
+    "print(loss)\n",
+    "\n",
+    "print(decode(bigram.generate(torch.zeros(1,1, dtype=torch.long, device=device), 100)[0].tolist()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "optimizer = torch.optim.AdamW(bigram.parameters(), lr=1e-3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2.5752177238464355\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i in range(10000):\n",
+    "    x,y = get_batch('train')\n",
+    "    logits, loss = bigram(x,y)\n",
+    "    optimizer.zero_grad()\n",
+    "    loss.backward()\n",
+    "    optimizer.step()\n",
+    "print(loss.item())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0\n",
+      "The borven iove s theannokintwaim, we---trs to ar-o ad anted I ves, “Hibouerthedeloke ier Theatis l\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(decode(bigram.generate(torch.zeros(1,1, dtype=torch.long, device=device), 100)[0].tolist()))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "verne-encoder-transformer",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/verne.txt b/verne.txt