add new tokens tutorial

author: gusalsdmlwlq <gusalsrhkals@naver.com> 2020-08-05 09:12:23 +0300
committer: gusalsdmlwlq <gusalsrhkals@naver.com> 2020-08-05 09:12:23 +0300
commit: c091f894a8f767e0b15d20720668943a58ab1313 (patch)
tree: 2fe87b8d97f1f60c67f986e68d32f598ea2a01a1
parent: 72ccd5f13b6e2a2de6e0dd879905a912171d1022 (diff)
1 files changed, 138 insertions, 0 deletions
diff --git a/python/add_new_vocab.ipynb b/python/add_new_vocab.ipynb
new file mode 100644
index 0000000..fb2dedc
--- /dev/null
+++ b/python/add_new_vocab.ipynb
@@ -0,0 +1,138 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### You can add new special tokens to a pre-trained sentencepiece model\n",
+    "#### Run this code in google/sentencepiece/python/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load the pre-trained sentencepiece model\n",
+    "A pre-trained model file (`old.model`) is required."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "371391"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import sentencepiece_model_pb2 as model\n",
+    "# Read the serialized model inside a context manager so the file handle is closed right away.\n",
+    "with open(\"old.model\", \"rb\") as model_file:\n",
+    "    model_bytes = model_file.read()\n",
+    "\n",
+    "m = model.ModelProto()\n",
+    "# Kept as the last expression so the cell still displays the value returned by ParseFromString.\n",
+    "m.ParseFromString(model_bytes)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load the tokens you want to add\n",
+    "Prepare the list of new tokens you want to add (`special_tokens.txt`, one token per line)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['[UNK]',\n",
+       " '[PAD]',\n",
+       " '[CLS]',\n",
+       " '[SEP]',\n",
+       " '[MASK]',\n",
+       " '[EOS]',\n",
+       " '[DOMAIN]',\n",
+       " '[SLOT]',\n",
+       " '[ACTION]']"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# One token per line. The file is closed by the context manager, and blank entries\n",
+    "# (e.g. from a trailing newline at the end of the file) are dropped.\n",
+    "with open(\"special_tokens.txt\", \"r\") as token_file:\n",
+    "    special_tokens = [line for line in token_file.read().split(\"\\n\") if line]\n",
+    "special_tokens"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Add the new tokens to the sentencepiece model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pieces already present in the model; used to skip duplicates so that\n",
+    "# re-running this cell does not append the same token twice.\n",
+    "existing_pieces = {piece.piece for piece in m.pieces}\n",
+    "\n",
+    "for token in special_tokens:\n",
+    "    # Skip blank entries and tokens the model already contains.\n",
+    "    if not token or token in existing_pieces:\n",
+    "        continue\n",
+    "    # SentencePiece is a nested message class; no ModelProto instance is needed to reach it.\n",
+    "    new_token = model.ModelProto.SentencePiece()\n",
+    "    new_token.piece = token\n",
+    "    new_token.score = 0\n",
+    "    m.pieces.append(new_token)\n",
+    "    existing_pieces.add(token)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Save the new sentencepiece model\n",
+    "You can then load the new model (`new.model`) in your NLP system."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the updated model to a new file, leaving old.model untouched.\n",
+    "with open('new.model', 'wb') as model_out:\n",
+    "    serialized_model = m.SerializeToString()\n",
+    "    model_out.write(serialized_model)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
author	gusalsdmlwlq <gusalsrhkals@naver.com>	2020-08-05 09:12:23 +0300
committer	gusalsdmlwlq <gusalsrhkals@naver.com>	2020-08-05 09:12:23 +0300
commit	c091f894a8f767e0b15d20720668943a58ab1313 (patch)
tree	2fe87b8d97f1f60c67f986e68d32f598ea2a01a1
parent	72ccd5f13b6e2a2de6e0dd879905a912171d1022 (diff)