Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/sentencepiece.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorgusalsdmlwlq <gusalsrhkals@naver.com>2020-08-05 09:12:23 +0300
committergusalsdmlwlq <gusalsrhkals@naver.com>2020-08-05 09:12:23 +0300
commitc091f894a8f767e0b15d20720668943a58ab1313 (patch)
tree2fe87b8d97f1f60c67f986e68d32f598ea2a01a1
parent72ccd5f13b6e2a2de6e0dd879905a912171d1022 (diff)
add new tokens tutorial
-rw-r--r--python/add_new_vocab.ipynb138
1 files changed, 138 insertions, 0 deletions
diff --git a/python/add_new_vocab.ipynb b/python/add_new_vocab.ipynb
new file mode 100644
index 0000000..fb2dedc
--- /dev/null
+++ b/python/add_new_vocab.ipynb
@@ -0,0 +1,138 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### You can add new special tokens to a pre-trained SentencePiece model\n",
+ "#### Run this code in google/sentencepiece/python/"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Load the pre-trained SentencePiece model\n",
+ "A pre-trained model file (`old.model`) is required."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "371391"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import sentencepiece_model_pb2 as model\n",
+ "\n",
+ "# Read the serialized model inside a context manager so the file handle\n",
+ "# is closed as soon as the bytes are in memory.\n",
+ "with open(\"old.model\", \"rb\") as model_file:\n",
+ "    model_bytes = model_file.read()\n",
+ "\n",
+ "# Parse the bytes into a ModelProto message; the call stays the last\n",
+ "# expression so the cell still displays its return value.\n",
+ "m = model.ModelProto()\n",
+ "m.ParseFromString(model_bytes)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Load the tokens you want to add\n",
+ "Prepare the list of new tokens you want to add, one token per line in `special_tokens.txt`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['[UNK]',\n",
+ " '[PAD]',\n",
+ " '[CLS]',\n",
+ " '[SEP]',\n",
+ " '[MASK]',\n",
+ " '[EOS]',\n",
+ " '[DOMAIN]',\n",
+ " '[SLOT]',\n",
+ " '[ACTION]']"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Read one token per line. Empty entries are dropped so that a trailing\n",
+ "# newline (or a blank line) in the file cannot produce an empty token.\n",
+ "with open(\"special_tokens.txt\", \"r\") as token_file:\n",
+ "    special_tokens = [line for line in token_file.read().split(\"\\n\") if line]\n",
+ "special_tokens"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Add the new tokens to the SentencePiece model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Track the pieces already in the model so this cell is safe to re-run:\n",
+ "# a token that is already present (or listed twice) is not appended again.\n",
+ "existing_pieces = {piece.piece for piece in m.pieces}\n",
+ "for token in special_tokens:\n",
+ "    # Skip empty strings and tokens the vocabulary already contains.\n",
+ "    if not token or token in existing_pieces:\n",
+ "        continue\n",
+ "    # SentencePiece is a nested message type, so it is reachable on the\n",
+ "    # ModelProto class itself; no throwaway ModelProto instance is needed.\n",
+ "    new_token = model.ModelProto.SentencePiece()\n",
+ "    new_token.piece = token\n",
+ "    new_token.score = 0\n",
+ "    m.pieces.append(new_token)\n",
+ "    existing_pieces.add(token)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Save the new SentencePiece model\n",
+ "You can then load the new model (`new.model`) into your NLP system."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Serialize the updated ModelProto and write the bytes to new.model.\n",
+ "with open('new.model', 'wb') as out_file:\n",
+ "    serialized_model = m.SerializeToString()\n",
+ "    out_file.write(serialized_model)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}