diff options
author | gusalsdmlwlq <gusalsrhkals@naver.com> | 2020-08-05 09:12:23 +0300 |
---|---|---|
committer | gusalsdmlwlq <gusalsrhkals@naver.com> | 2020-08-05 09:12:23 +0300 |
commit | c091f894a8f767e0b15d20720668943a58ab1313 (patch) | |
tree | 2fe87b8d97f1f60c67f986e68d32f598ea2a01a1 | |
parent | 72ccd5f13b6e2a2de6e0dd879905a912171d1022 (diff) |
add new tokens tutorial
-rw-r--r-- | python/add_new_vocab.ipynb | 138 |
1 files changed, 138 insertions, 0 deletions
diff --git a/python/add_new_vocab.ipynb b/python/add_new_vocab.ipynb
new file mode 100644
index 0000000..fb2dedc
--- /dev/null
+++ b/python/add_new_vocab.ipynb
@@ -0,0 +1,152 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### You can add new special tokens to a pre-trained sentencepiece model\n",
+    "#### Run this code in google/sentencepiece/python/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load the pre-trained sentencepiece model\n",
+    "A pre-trained model file (`old.model` here) is required."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "371391"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import sentencepiece_model_pb2 as model\n",
+    "\n",
+    "m = model.ModelProto()\n",
+    "# Read through a context manager so the file handle is always closed.\n",
+    "with open(\"old.model\", \"rb\") as model_file:\n",
+    "    serialized_model = model_file.read()\n",
+    "# Kept as the last expression so the cell still displays its return value.\n",
+    "m.ParseFromString(serialized_model)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load the tokens you want to add\n",
+    "Prepare the list of new tokens you want to add: one token per line in `special_tokens.txt` (UTF-8)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['[UNK]',\n",
+       " '[PAD]',\n",
+       " '[CLS]',\n",
+       " '[SEP]',\n",
+       " '[MASK]',\n",
+       " '[EOS]',\n",
+       " '[DOMAIN]',\n",
+       " '[SLOT]',\n",
+       " '[ACTION]']"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Drop empty entries (e.g. from a trailing newline) so no empty piece is added.\n",
+    "with open(\"special_tokens.txt\", \"r\", encoding=\"utf-8\") as token_file:\n",
+    "    special_tokens = [token for token in token_file.read().split(\"\\n\") if token]\n",
+    "special_tokens"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Add the new tokens to the sentencepiece model\n",
+    "Tokens that are already in the model are skipped, so re-running this cell does not add duplicates."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Track known pieces so each token is appended at most once (idempotent cell).\n",
+    "existing_pieces = {piece.piece for piece in m.pieces}\n",
+    "for token in special_tokens:\n",
+    "    if token in existing_pieces:\n",
+    "        continue\n",
+    "    # SentencePiece is a nested message class; no ModelProto instance is needed.\n",
+    "    new_token = model.ModelProto.SentencePiece()\n",
+    "    new_token.piece = token\n",
+    "    new_token.score = 0\n",
+    "    m.pieces.append(new_token)\n",
+    "    existing_pieces.add(token)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Save the new sentencepiece model\n",
+    "Then load the new sentencepiece model (`new.model`) in your NLP system."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('new.model', 'wb') as f:\n",
+    "    f.write(m.SerializeToString())"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}