pre-training bert

Kim-kwan-woo · Oct 5, 2021 · f2a9039 · f2a9039
1 parent 1d31c6f
commit f2a9039
Show file tree

Hide file tree

Showing 4 changed files with 15,758 additions and 0 deletions.
diff --git a/bert_train.ipynb b/bert_train.ipynb
@@ -0,0 +1,232 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Importing Libraries and Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import tensorflow as tf\n",
+    "from tensorflow.keras.layers import Dense, Input\n",
+    "from tensorflow.keras.optimizers import Adam\n",
+    "from tensorflow.keras.models import Model\n",
+    "from tensorflow.keras.callbacks import ModelCheckpoint\n",
+    "import tensorflow_hub as hub\n",
+    "\n",
+    "import re\n",
+    "from bert import tokenization\n",
+    "import string\n",
+    "# Expose tf.io.gfile under the older tf.gfile attribute name.\n",
+    "# NOTE(review): presumably needed because the imported `bert.tokenization`\n",
+    "# module still refers to `tf.gfile` - confirm against the installed version.\n",
+    "tf.gfile = tf.io.gfile"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the three competition CSV files in one pass; order matches the targets.\n",
+    "train, test, submission = map(\n",
+    "    pd.read_csv,\n",
+    "    (\"./data/train.csv\", \"./data/test.csv\", \"./data/sample_submission.csv\"),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Helper functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def bert_encode(texts, tokenizer, max_len=512):\n",
+    "    \"\"\"Turn raw texts into fixed-length BERT input arrays.\n",
+    "\n",
+    "    Each text is tokenized, cut to at most ``max_len - 2`` tokens, wrapped in\n",
+    "    ``[CLS]`` / ``[SEP]`` and right-padded with id 0 up to ``max_len``.\n",
+    "\n",
+    "    Args:\n",
+    "        texts: iterable of strings to encode.\n",
+    "        tokenizer: object providing ``tokenize(text)`` and\n",
+    "            ``convert_tokens_to_ids(tokens)``.\n",
+    "        max_len: length of every encoded row, special tokens included.\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple ``(token_ids, pad_masks, segment_ids)`` of NumPy arrays with one\n",
+    "        row per text. ``pad_masks`` is 1 on real tokens and 0 on padding;\n",
+    "        ``segment_ids`` is all zeros.\n",
+    "    \"\"\"\n",
+    "    token_rows, mask_rows, segment_rows = [], [], []\n",
+    "    body_budget = max_len - 2  # two slots are reserved for [CLS] and [SEP]\n",
+    "\n",
+    "    for raw_text in texts:\n",
+    "        pieces = tokenizer.tokenize(raw_text)[:body_budget]\n",
+    "        sequence = [\"[CLS]\"] + pieces + [\"[SEP]\"]\n",
+    "        n_real = len(sequence)\n",
+    "        n_pad = max_len - n_real\n",
+    "\n",
+    "        ids = tokenizer.convert_tokens_to_ids(sequence)\n",
+    "        ids += [0] * n_pad  # pad token ids with zeros on the right\n",
+    "\n",
+    "        token_rows.append(ids)\n",
+    "        mask_rows.append([1] * n_real + [0] * n_pad)\n",
+    "        segment_rows.append([0] * max_len)  # single-sentence input: one segment\n",
+    "\n",
+    "    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def build_model(bert_layer, max_len=512, learning_rate=1e-5):\n",
+    "    \"\"\"Build and compile a binary classifier on top of a BERT layer.\n",
+    "\n",
+    "    Args:\n",
+    "        bert_layer: callable layer taking ``[word_ids, mask, segment_ids]`` and\n",
+    "            returning two outputs, the second being the per-token sequence output.\n",
+    "        max_len: length of each of the three integer input sequences.\n",
+    "        learning_rate: step size passed to the Adam optimizer.\n",
+    "\n",
+    "    Returns:\n",
+    "        A compiled Keras ``Model`` with a single sigmoid output, trained with\n",
+    "        binary cross-entropy and reporting accuracy.\n",
+    "    \"\"\"\n",
+    "    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"input_word_ids\")\n",
+    "    input_mask = Input(shape=(max_len,), dtype=tf.int32, name=\"input_mask\")\n",
+    "    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"segment_ids\")\n",
+    "\n",
+    "    # The first output of the layer is discarded; only the sequence output is used.\n",
+    "    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])\n",
+    "    # Position 0 is where bert_encode places the [CLS] token.\n",
+    "    clf_output = sequence_output[:, 0, :]\n",
+    "    out = Dense(1, activation='sigmoid')(clf_output)\n",
+    "\n",
+    "    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)\n",
+    "    # `lr` is a deprecated alias; `learning_rate` is the supported keyword.\n",
+    "    model.compile(\n",
+    "        optimizer=Adam(learning_rate=learning_rate),\n",
+    "        loss='binary_crossentropy',\n",
+    "        metrics=['accuracy'],\n",
+    "    )\n",
+    "\n",
+    "    return model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data cleaning"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def lowercase_text(text):\n",
+    "    \"\"\"Return ``text`` converted to lower case.\"\"\"\n",
+    "    lowered = text.lower()\n",
+    "    return lowered\n",
+    "\n",
+    "# Pass the function directly (no wrapper lambda) and index the column\n",
+    "# explicitly instead of assigning through attribute access.\n",
+    "train[\"text\"] = train[\"text\"].apply(lowercase_text)\n",
+    "test[\"text\"] = test[\"text\"].apply(lowercase_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def remove_noise(text):\n",
+    "    \"\"\"Strip markup-like noise from a single text.\n",
+    "\n",
+    "    Removes, in this order: square-bracketed spans, URLs, angle-bracketed\n",
+    "    tags, ASCII punctuation, newline characters, and any word that contains\n",
+    "    a digit. Matches are deleted outright (not replaced by a space), so\n",
+    "    surrounding whitespace is left as it was.\n",
+    "\n",
+    "    Args:\n",
+    "        text: the string to clean.\n",
+    "\n",
+    "    Returns:\n",
+    "        The cleaned string.\n",
+    "    \"\"\"\n",
+    "    # Raw-string patterns: the regex escapes are not valid Python string\n",
+    "    # escapes and would otherwise trigger invalid-escape warnings.\n",
+    "    text = re.sub(r'\\[.*?\\]', '', text)  # square-bracketed spans\n",
+    "    text = re.sub(r'https?://\\S+|www\\.\\S+', '', text)  # URLs\n",
+    "    text = re.sub(r'<.*?>+', '', text)  # angle-bracketed tags\n",
+    "    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # punctuation\n",
+    "    text = re.sub(r'\\n', '', text)  # newline characters\n",
+    "    text = re.sub(r'\\w*\\d\\w*', '', text)  # words containing a digit\n",
+    "    return text\n",
+    "\n",
+    "# Pass the function directly (no wrapper lambda) and index the column\n",
+    "# explicitly instead of assigning through attribute access.\n",
+    "train[\"text\"] = train[\"text\"].apply(remove_noise)\n",
+    "test[\"text\"] = test[\"text\"].apply(remove_noise)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0    our deeds are the reason of this earthquake ma...\n",
+       "1                forest fire near la ronge sask canada\n",
+       "2    all residents asked to shelter in place are be...\n",
+       "3     people receive wildfires evacuation orders in...\n",
+       "4    just got sent this photo from ruby alaska as s...\n",
+       "Name: text, dtype: object"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Preview the first five texts after cleaning.\n",
+    "train[\"text\"].head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load Pre-trained BERT and Encode Inputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TF-Hub handle of the BERT module to load (uncased English, L-24 / H-1024 /\n",
+    "# A-16 according to the module name).\n",
+    "module_url = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1\"\n",
+    "# Wrap the hub module as a Keras layer; it is created with trainable=True.\n",
+    "bert_layer = hub.KerasLayer(module_url, trainable=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the vocabulary file path and the lower-casing flag stored on the loaded\n",
+    "# hub module, then build the tokenizer from those two values.\n",
+    "vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()\n",
+    "do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()\n",
+    "tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Single source of truth for the encoded sequence length, so the train and\n",
+    "# test inputs cannot drift apart.\n",
+    "MAX_SEQ_LEN = 160\n",
+    "\n",
+    "train_input = bert_encode(train.text.values, tokenizer, max_len=MAX_SEQ_LEN)\n",
+    "test_input = bert_encode(test.text.values, tokenizer, max_len=MAX_SEQ_LEN)\n",
+    "train_labels = train.target.values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}