-rw-r--r--  README.md                                |  57
-rw-r--r--  docs/changelog.md                        |  44
-rw-r--r--  docs/guides/embeddings.md                | 125
-rw-r--r--  docs/guides/quickstart.md                |  62
-rw-r--r--  keybert/__init__.py                      |   3
-rw-r--r--  keybert/backend/__init__.py              |   6
-rw-r--r--  keybert/backend/_base.py                 |  33
-rw-r--r--  keybert/backend/_flair.py                |  72
-rw-r--r--  keybert/backend/_gensim.py               |  71
-rw-r--r--  keybert/backend/_sentencetransformers.py |  54
-rw-r--r--  keybert/backend/_spacy.py                |  99
-rw-r--r--  keybert/backend/_use.py                  |  54
-rw-r--r--  keybert/model.py                         | 176
-rw-r--r--  mkdocs.yml                               |   9
-rw-r--r--  setup.py                                 |  29
16 files changed, 747 insertions(+), 191 deletions(-)
diff --git a/README.md b/README.md
@@ -2,6 +2,7 @@
 [![PyPI - License](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/MaartenGr/keybert/blob/master/LICENSE)
 [![PyPI - PyPi](https://img.shields.io/pypi/v/keyBERT)](https://pypi.org/project/keybert/)
 [![Build](https://img.shields.io/github/workflow/status/MaartenGr/keyBERT/Code%20Checks/master)](https://pypi.org/project/keybert/)
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1OxpgwKqSzODtO3vS7Xe1nEmZMCAIMckX?usp=sharing)

<img src="images/logo.png" width="35%" height="35%" align="right" />

@@ -65,10 +66,19 @@ Installation can be done using [pypi](https://pypi.org/project/keybert/):
pip install keybert
```

-To use Flair embeddings, install KeyBERT as follows:
+You may want to install more depending on the transformers and language backends that you will be using. The possible installations are:

```
pip install keybert[flair]
+pip install keybert[gensim]
+pip install keybert[spacy]
+pip install keybert[use]
+```
+
+To install all backends:
+
+```
+pip install keybert[all]
```

<a name="usage"/></a>
@@ -90,14 +100,14 @@ doc = """
         the learning algorithm to generalize from the training data to unseen situations in a
         'reasonable' way (see inductive bias).
      """
-model = KeyBERT('distilbert-base-nli-mean-tokens')
-keywords = model.extract_keywords(doc)
+kw_model = KeyBERT('distilbert-base-nli-mean-tokens')
+keywords = kw_model.extract_keywords(doc)
```

You can set `keyphrase_ngram_range` to set the length of the resulting keywords/keyphrases:

```python
->>> model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=None)
+>>> kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=None)
[('learning', 0.4604),
 ('algorithm', 0.4556),
 ('training', 0.4487),
@@ -109,7 +119,7 @@
To extract keyphrases, simply set `keyphrase_ngram_range` to (1, 2) or higher depending on the number
of words you would like in the resulting keyphrases:

```python
->>> model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words=None)
+>>> kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words=None)
[('learning algorithm', 0.6978),
 ('machine learning', 0.6305),
 ('supervised learning', 0.5985),
@@ -125,13 +135,13 @@ have shown great performance in semantic similarity and paraphrase identification

<a name="maxsum"/></a>
### 2.3. Max Sum Similarity

-To diversity the results, we take the 2 x top_n most similar words/phrases to the document.
+To diversify the results, we take the 2 x top_n most similar words/phrases to the document.
Then, we take all top_n combinations from the 2 x top_n words and extract the combination in which
the words are the least similar to each other by cosine similarity.

```python
->>> model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english',
-                           use_maxsum=True, nr_candidates=20, top_n=5)
+>>> kw_model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english',
+                              use_maxsum=True, nr_candidates=20, top_n=5)
[('set training examples', 0.7504),
 ('generalize training data', 0.7727),
 ('requires learning algorithm', 0.5050),
@@ -148,8 +158,8 @@ keywords / keyphrases which is also based on cosine similarity.
The results with **high diversity**:

```python
->>> model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english',
-                           use_mmr=True, diversity=0.7)
+>>> kw_model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english',
+                              use_mmr=True, diversity=0.7)
[('algorithm generalize training', 0.7727),
 ('labels unseen instances', 0.1649),
 ('new examples optimal', 0.4185),
@@ -160,8 +170,8 @@ with **high diversity**:
The results with **low diversity**:

```python
->>> model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english',
-                           use_mmr=True, diversity=0.2)
+>>> kw_model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english',
+                              use_mmr=True, diversity=0.2)
[('algorithm generalize training', 0.7727),
 ('supervised learning algorithm', 0.7502),
 ('learning machine learning', 0.7577),
@@ -172,8 +182,15 @@ The results with **low diversity**:

<a name="embeddings"/></a>
### 2.5. Embedding Models
-The parameter `model` takes in a string pointing to a sentence-transformers model,
-a SentenceTransformer, or a Flair DocumentEmbedding model.
+KeyBERT supports many embedding models that can be used to embed the documents and words:
+
+* Sentence-Transformers
+* Flair
+* Spacy
+* Gensim
+* USE
+
+Click [here](https://maartengr.github.io/KeyBERT/guides/embeddings.html) for a full overview of all supported embedding models.

**Sentence-Transformers**
You can select any model from `sentence-transformers` [here](https://www.sbert.net/docs/pretrained_models.html)
and pass it through KeyBERT with `model`:

```python
from keybert import KeyBERT
-model = KeyBERT(model='distilbert-base-nli-mean-tokens')
+kw_model = KeyBERT(model='distilbert-base-nli-mean-tokens')
```

Or select a SentenceTransformer model with your own parameters:

```python
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens", device="cpu")
-model = KeyBERT(model=sentence_model)
+kw_model = KeyBERT(model=sentence_model)
```

**Flair**
[Flair](https://github.com/flairNLP/flair) allows you to choose almost any embedding model that
is publicly available. Flair can be used as follows:

```python
from keybert import KeyBERT
from flair.embeddings import TransformerDocumentEmbeddings

roberta = TransformerDocumentEmbeddings('roberta-base')
-model = KeyBERT(model=roberta)
+kw_model = KeyBERT(model=roberta)
```

You can select any 🤗 transformers model [here](https://huggingface.co/models).

@@ -218,7 +235,7 @@
To cite KeyBERT in your work, please use the following bibtex reference:

    title = {KeyBERT: Minimal keyword extraction with BERT.},
    year = 2020,
    publisher = {Zenodo},
-   version = {v0.1.3},
+   version = {v0.3.0},
    doi = {10.5281/zenodo.4461265},
    url = {https://doi.org/10.5281/zenodo.4461265}
}

@@ -238,10 +255,10 @@
but most importantly, these are amazing resources for creating impressive keyword extraction models:
* https://github.com/swisscom/ai-research-keyphrase-extraction

**MMR**:
-The selection of keywords/keyphrases was modelled after:
+The selection of keywords/keyphrases was modeled after:
* https://github.com/swisscom/ai-research-keyphrase-extraction

**NOTE**: If you find a paper or github repo that has an easy-to-use implementation
of BERT-embeddings for keyword/keyphrase extraction, let me know! I'll make sure to
-add it a reference to this repo.
+add a reference to this repo.
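The headline change behind the README edits above is the new `candidates` argument to `extract_keywords` (added to `keybert/model.py` later in this diff). As orientation before the per-file changes below, a minimal sketch of the call — the candidate list here is purely illustrative, mirroring the quickstart example further down:

```python
from keybert import KeyBERT

# `doc` as in the README example above; this candidate list is illustrative
candidates = ["supervised learning", "training data", "learning algorithm"]

kw_model = KeyBERT('distilbert-base-nli-mean-tokens')
keywords = kw_model.extract_keywords(doc, candidates)
```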
diff --git a/docs/changelog.md b/docs/changelog.md
new file mode 100644
index 0000000..52e1b05
--- /dev/null
+++ b/docs/changelog.md
@@ -0,0 +1,44 @@
+## **Version 0.3.0**
+*Release date: 10 May, 2021*
+
+The two main features are **candidate keywords**
+and several **backends** to use instead of Flair and SentenceTransformers!
+
+**Highlights**:
+
+* Use candidate words instead of extracting those from the documents ([#25](https://github.com/MaartenGr/KeyBERT/issues/25))
+    * ```KeyBERT().extract_keywords(doc, candidates)```
+* Spacy, Gensim, USE, and Custom Backends were added (see documentation [here](https://maartengr.github.io/KeyBERT/guides/embeddings.html))
+
+**Fixes**:
+
+* Improved imports
+* Fix encoding error when locally installing KeyBERT ([#30](https://github.com/MaartenGr/KeyBERT/issues/30))
+
+**Miscellaneous**:
+
+* Improved documentation (ReadMe & MKDocs)
+* Add the main tutorial as a shield (Colab badge)
+* Typos ([#31](https://github.com/MaartenGr/KeyBERT/pull/31), [#35](https://github.com/MaartenGr/KeyBERT/pull/35))
+
+
+## **Version 0.2.0**
+*Release date: 9 Feb, 2021*
+
+**Highlights**:
+
+* Add similarity scores to the output
+* Add Flair as a possible back-end
+* Update documentation + improved testing
+
+## **Version 0.1.2**
+*Release date: 28 Oct, 2020*
+
+Added Max Sum Similarity as an option to diversify your results.
+
+
+## **Version 0.1.0**
+*Release date: 27 Oct, 2020*
+
+This first release includes keyword/keyphrase extraction using BERT and simple cosine similarity.
+There is also an option to use Maximal Marginal Relevance to select the candidate keywords/keyphrases.
diff --git a/docs/guides/embeddings.md b/docs/guides/embeddings.md
index bc1b3d9..3faedad 100644
--- a/docs/guides/embeddings.md
+++ b/docs/guides/embeddings.md
@@ -1,36 +1,137 @@
-## **Embedding Models**
-The parameter `model` takes in a string pointing to a sentence-transformers model,
-a SentenceTransformer, or a Flair DocumentEmbedding model.
+# Embedding Models
+In this tutorial we will be going through the embedding models that can be used in KeyBERT.
+Having the option to choose embedding models allows you to leverage pre-trained embeddings that suit your use-case.

-### **Sentence-Transformers**
-You can select any model from `sentence-transformers` [here](https://www.sbert.net/docs/pretrained_models.html)
+### **Sentence Transformers**
+You can select any model from sentence-transformers [here](https://www.sbert.net/docs/pretrained_models.html)
and pass it through KeyBERT with `model`:

```python
from keybert import KeyBERT
-model = KeyBERT(model='distilbert-base-nli-mean-tokens')
+kw_model = KeyBERT(model="xlm-r-bert-base-nli-stsb-mean-tokens")
```

Or select a SentenceTransformer model with your own parameters:

```python
-from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

-sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens", device="cpu")
-model = KeyBERT(model=sentence_model)
+sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens", device="cuda")
+kw_model = KeyBERT(model=sentence_model)
```

-### **Flair**
+### **Flair**
[Flair](https://github.com/flairNLP/flair) allows you to choose almost any embedding model that
is publicly available.
Flair can be used as follows:

```python
-from keybert import KeyBERT
from flair.embeddings import TransformerDocumentEmbeddings

roberta = TransformerDocumentEmbeddings('roberta-base')
-model = KeyBERT(model=roberta)
+kw_model = KeyBERT(model=roberta)
```

You can select any 🤗 transformers model [here](https://huggingface.co/models).
+
+Moreover, you can also use Flair word embeddings and pool them to create document embeddings.
+Under the hood, Flair simply averages all word embeddings in a document. Then, we can easily
+pass it to KeyBERT in order to use those word embeddings as document embeddings:
+
+```python
+from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
+
+glove_embedding = WordEmbeddings('crawl')
+document_glove_embeddings = DocumentPoolEmbeddings([glove_embedding])
+
+kw_model = KeyBERT(model=document_glove_embeddings)
+```
+
+### **Spacy**
+[Spacy](https://github.com/explosion/spaCy) is an amazing framework for processing text. There are
+many models available across many languages for modeling text.
+
+To use Spacy's non-transformer models in KeyBERT:
+
+```python
+import spacy
+
+nlp = spacy.load("en_core_web_md", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
+
+kw_model = KeyBERT(model=nlp)
+```
+
+Using spacy-transformer models:
+
+```python
+import spacy
+
+spacy.prefer_gpu()
+nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
+
+kw_model = KeyBERT(model=nlp)
+```
+
+If you run into memory issues with spacy-transformer models, try:
+
+```python
+import spacy
+from thinc.api import set_gpu_allocator, require_gpu
+
+nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
+set_gpu_allocator("pytorch")
+require_gpu(0)
+
+kw_model = KeyBERT(model=nlp)
+```
+
+### **Universal Sentence Encoder (USE)**
+The Universal Sentence Encoder encodes text into high dimensional vectors that are used here
+for embedding the documents. The model is trained and optimized for greater-than-word length text,
+such as sentences, phrases or short paragraphs.
+
+Using USE in KeyBERT is rather straightforward:
+
+```python
+import tensorflow_hub
+embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
+kw_model = KeyBERT(model=embedding_model)
+```
+
+### **Gensim**
+For Gensim, KeyBERT supports its `gensim.downloader` module. Here, we can download any word embedding model
+to be used in KeyBERT. Note that Gensim is primarily used for word embedding models. This typically works
+best for short documents since the word embeddings are pooled.
+
+```python
+import gensim.downloader as api
+ft = api.load('fasttext-wiki-news-subwords-300')
+kw_model = KeyBERT(model=ft)
+```
+
+### **Custom Backend**
+If your backend or model cannot be found in the ones currently available, you can use the `keybert.backend.BaseEmbedder` class to
+create your own backend.
Below, you will find an example of creating a SentenceTransformer backend for KeyBERT:
+
+```python
+from keybert.backend import BaseEmbedder
+from sentence_transformers import SentenceTransformer
+
+class CustomEmbedder(BaseEmbedder):
+    def __init__(self, embedding_model):
+        super().__init__()
+        self.embedding_model = embedding_model
+
+    def embed(self, documents, verbose=False):
+        embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)
+        return embeddings
+
+# Create custom backend
+distilbert = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")
+custom_embedder = CustomEmbedder(embedding_model=distilbert)
+
+# Pass custom backend to keybert
+kw_model = KeyBERT(model=custom_embedder)
+```
diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md
index fc23615..d5bf16b 100644
--- a/docs/guides/quickstart.md
+++ b/docs/guides/quickstart.md
@@ -1,18 +1,20 @@
## **Installation**
-Installation can be done using [pypi](https://pypi.org/project/bertopic/):
+Installation can be done using [pypi](https://pypi.org/project/keybert/):

```
pip install keybert
```

-To use Flair embeddings, install KeyBERT as follows:
+You may want to install more depending on the transformers and language backends that you will be using. The possible installations are:

```
pip install keybert[flair]
+pip install keybert[gensim]
+pip install keybert[spacy]
+pip install keybert[use]
```

-Or to install all additional dependencies:
-
+To install all backends:

```
pip install keybert[all]
@@ -36,14 +38,14 @@ doc = """
         the learning algorithm to generalize from the training data to unseen situations in a
         'reasonable' way (see inductive bias).
      """
-model = KeyBERT('distilbert-base-nli-mean-tokens')
-keywords = model.extract_keywords(doc)
+kw_model = KeyBERT('distilbert-base-nli-mean-tokens')
+keywords = kw_model.extract_keywords(doc)
```

You can set `keyphrase_ngram_range` to set the length of the resulting keywords/keyphrases:

```python
->>> model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=None)
+>>> kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=None)
[('learning', 0.4604),
 ('algorithm', 0.4556),
 ('training', 0.4487),
@@ -55,7 +57,7 @@
To extract keyphrases, simply set `keyphrase_ngram_range` to (1, 2) or higher depending on the number
of words you would like in the resulting keyphrases:

```python
->>> model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words=None)
+>>> kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words=None)
[('learning algorithm', 0.6978),
 ('machine learning', 0.6305),
 ('supervised learning', 0.5985),
@@ -69,13 +71,13 @@ have shown great performance in semantic similarity and paraphrase identification

### Max Sum Similarity

-To diversity the results, we take the 2 x top_n most similar words/phrases to the document.
+To diversify the results, we take the 2 x top_n most similar words/phrases to the document.
Then, we take all top_n combinations from the 2 x top_n words and extract the combination in which
the words are the least similar to each other by cosine similarity.
```python
->>> model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english',
-                           use_maxsum=True, nr_candidates=20, top_n=5)
+>>> kw_model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english',
+                              use_maxsum=True, nr_candidates=20, top_n=5)
[('set training examples', 0.7504),
 ('generalize training data', 0.7727),
 ('requires learning algorithm', 0.5050),
@@ -90,8 +92,8 @@ keywords / keyphrases which is also based on cosine similarity.

The results with **high diversity**:

```python
->>> model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english',
-                           use_mmr=True, diversity=0.7)
+>>> kw_model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english',
+                              use_mmr=True, diversity=0.7)
[('algorithm generalize training', 0.7727),
 ('labels unseen instances', 0.1649),
 ('new examples optimal', 0.4185),
@@ -102,11 +104,41 @@ with **high diversity**:
The results with **low diversity**:

```python
->>> model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english',
-                           use_mmr=True, diversity=0.2)
+>>> kw_model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english',
+                              use_mmr=True, diversity=0.2)
[('algorithm generalize training', 0.7727),
 ('supervised learning algorithm', 0.7502),
 ('learning machine learning', 0.7577),
 ('learning algorithm analyzes', 0.7587),
 ('learning algorithm generalize', 0.7514)]
```
+
+### Candidate Keywords/Keyphrases
+In some cases, one might want to use candidate keywords generated by other keyword algorithms or retrieved from a select list of possible keywords/keyphrases. In KeyBERT, you can easily use those candidate keywords to perform keyword extraction:
+
+```python
+import yake
+from keybert import KeyBERT
+
+doc = """
+         Supervised learning is the machine learning task of learning a function that
+         maps an input to an output based on example input-output pairs.[1] It infers a
+         function from labeled training data consisting of a set of training examples.[2]
+         In supervised learning, each example is a pair consisting of an input object
+         (typically a vector) and a desired output value (also called the supervisory signal).
+         A supervised learning algorithm analyzes the training data and produces an inferred function,
+         which can be used for mapping new examples. An optimal scenario will allow for the
+         algorithm to correctly determine the class labels for unseen instances. This requires
+         the learning algorithm to generalize from the training data to unseen situations in a
+         'reasonable' way (see inductive bias).
+      """
+
+# Create candidates
+kw_extractor = yake.KeywordExtractor(top=50)
+candidates = kw_extractor.extract_keywords(doc)
+candidates = [candidate[0] for candidate in candidates]
+
+# KeyBERT init
+kw_model = KeyBERT()
+keywords = kw_model.extract_keywords(doc, candidates)
+```
\ No newline at end of file diff --git a/keybert/__init__.py b/keybert/__init__.py index dcb4c10..1f3b6e2 100644 --- a/keybert/__init__.py +++ b/keybert/__init__.py @@ -1,2 +1,3 @@ from keybert.model import KeyBERT -__version__ = "0.2.0" + +__version__ = "0.3.0" diff --git a/keybert/backend/__init__.py b/keybert/backend/__init__.py new file mode 100644 index 0000000..25fdcbf --- /dev/null +++ b/keybert/backend/__init__.py @@ -0,0 +1,6 @@ + +from ._base import BaseEmbedder + +__all__ = [ + "BaseEmbedder" +]
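Only `BaseEmbedder` is exported from `keybert.backend`; the concrete backends defined below are imported lazily inside `select_backend` (in `_utils.py` further down), which keeps the optional dependencies optional. A minimal sketch of the public import path this module establishes — the subclass name is hypothetical:

```python
# The public base class for custom backends; concrete backends
# (FlairBackend, SpacyBackend, ...) stay behind optional imports.
from keybert.backend import BaseEmbedder

class MyEmbedder(BaseEmbedder):  # hypothetical subclass name
    def embed(self, documents, verbose=False):
        ...  # return an np.ndarray of shape (n, m)
```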
\ No newline at end of file diff --git a/keybert/backend/_base.py b/keybert/backend/_base.py new file mode 100644 index 0000000..eefb4b7 --- /dev/null +++ b/keybert/backend/_base.py @@ -0,0 +1,33 @@ +import numpy as np +from typing import List + + +class BaseEmbedder: + """ The Base Embedder used for creating embedding models + Arguments: + embedding_model: The main embedding model to be used for extracting + document and word embedding + word_embedding_model: The embedding model used for extracting word + embeddings only. If this model is selected, + then the `embedding_model` is purely used for + creating document embeddings. + """ + def __init__(self, + embedding_model=None, + word_embedding_model=None): + self.embedding_model = embedding_model + self.word_embedding_model = word_embedding_model + + def embed(self, + documents: List[str], + verbose: bool = False) -> np.ndarray: + """ Embed a list of n documents/words into an n-dimensional + matrix of embeddings + Arguments: + documents: A list of documents or words to be embedded + verbose: Controls the verbosity of the process + Returns: + Document/words embeddings with shape (n, m) with `n` documents/words + that each have an embeddings size of `m` + """ + pass diff --git a/keybert/backend/_flair.py b/keybert/backend/_flair.py new file mode 100644 index 0000000..3e30cfd --- /dev/null +++ b/keybert/backend/_flair.py @@ -0,0 +1,72 @@ +import numpy as np +from tqdm import tqdm +from typing import Union, List +from flair.data import Sentence +from flair.embeddings import DocumentEmbeddings, TokenEmbeddings, DocumentPoolEmbeddings + +from keybert.backend import BaseEmbedder + + +class FlairBackend(BaseEmbedder): + """ Flair Embedding Model + The Flair embedding model used for generating document and + word embeddings. 
+    Arguments:
+        embedding_model: A Flair embedding model
+
+    Usage:
+
+    ```python
+    from keybert.backend import FlairBackend
+    from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
+
+    # Create a Flair Embedding model
+    glove_embedding = WordEmbeddings('crawl')
+    document_glove_embeddings = DocumentPoolEmbeddings([glove_embedding])
+
+    # Pass the Flair model to create a new backend
+    flair_embedder = FlairBackend(document_glove_embeddings)
+    ```
+    """
+    def __init__(self, embedding_model: Union[TokenEmbeddings, DocumentEmbeddings]):
+        super().__init__()
+
+        # Flair word embeddings
+        if isinstance(embedding_model, TokenEmbeddings):
+            self.embedding_model = DocumentPoolEmbeddings([embedding_model])
+
+        # Flair document embeddings + disable fine tune to prevent CUDA OOM
+        # https://github.com/flairNLP/flair/issues/1719
+        elif isinstance(embedding_model, DocumentEmbeddings):
+            if "fine_tune" in embedding_model.__dict__:
+                embedding_model.fine_tune = False
+            self.embedding_model = embedding_model
+
+        else:
+            raise ValueError("Please select a correct Flair model by preparing either a token or document "
+                             "embedding model: \n"
+                             "`from flair.embeddings import TransformerDocumentEmbeddings` \n"
+                             "`roberta = TransformerDocumentEmbeddings('roberta-base')`")
+
+    def embed(self,
+              documents: List[str],
+              verbose: bool = False) -> np.ndarray:
+        """ Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        embeddings = []
+        for index, document in tqdm(enumerate(documents), disable=not verbose):
+            try:
+                sentence = Sentence(document) if document else Sentence("an empty document")
+                self.embedding_model.embed(sentence)
+            except RuntimeError:
+                sentence = Sentence("an empty document")
+                self.embedding_model.embed(sentence)
+            embedding = sentence.embedding.detach().cpu().numpy()
+            embeddings.append(embedding)
+        embeddings = np.asarray(embeddings)
+        return embeddings
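One detail worth noting from the constructor above: plain Flair word embeddings are wrapped in `DocumentPoolEmbeddings` automatically, so a `TokenEmbeddings` instance can be passed directly. A minimal sketch, mirroring the docstring usage (assumes flair is installed):

```python
from flair.embeddings import WordEmbeddings
from keybert.backend._flair import FlairBackend

# A TokenEmbeddings subclass passed directly; the backend pools it
# into DocumentPoolEmbeddings internally
glove_embedding = WordEmbeddings('crawl')
flair_embedder = FlairBackend(glove_embedding)
embeddings = flair_embedder.embed(["a tiny test document"])
```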
\ No newline at end of file diff --git a/keybert/backend/_gensim.py b/keybert/backend/_gensim.py new file mode 100644 index 0000000..225d9f4 --- /dev/null +++ b/keybert/backend/_gensim.py @@ -0,0 +1,71 @@ +import numpy as np +from tqdm import tqdm +from typing import List +from keybert.backend import BaseEmbedder +from gensim.models.keyedvectors import Word2VecKeyedVectors + + +class GensimBackend(BaseEmbedder): + """ Gensim Embedding Model + + The Gensim embedding model is typically used for word embeddings with + GloVe, Word2Vec or FastText. + + Arguments: + embedding_model: A Gensim embedding model + + Usage: + + ```python + from keybert.backend import GensimBackend + import gensim.downloader as api + + ft = api.load('fasttext-wiki-news-subwords-300') + ft_embedder = GensimBackend(ft) + ``` + """ + def __init__(self, embedding_model: Word2VecKeyedVectors): + super().__init__() + + if isinstance(embedding_model, Word2VecKeyedVectors): + self.embedding_model = embedding_model + else: + raise ValueError("Please select a correct Gensim model: \n" + "`import gensim.downloader as api` \n" + "`ft = api.load('fasttext-wiki-news-subwords-300')`") + + def embed(self, + documents: List[str], + verbose: bool = False) -> np.ndarray: + """ Embed a list of n documents/words into an n-dimensional + matrix of embeddings + + Arguments: + documents: A list of documents or words to be embedded + verbose: Controls the verbosity of the process + + Returns: + Document/words embeddings with shape (n, m) with `n` documents/words + that each have an embeddings size of `m` + """ + vector_shape = self.embedding_model.word_vec(list(self.embedding_model.vocab.keys())[0]).shape + empty_vector = np.zeros(vector_shape[0]) + + embeddings = [] + for doc in tqdm(documents, disable=not verbose, position=0, leave=True): + doc_embedding = [] + + # Extract word embeddings + for word in doc.split(" "): + try: + word_embedding = self.embedding_model.word_vec(word) + doc_embedding.append(word_embedding) + except KeyError: + doc_embedding.append(empty_vector) + + # Pool word embeddings + doc_embedding = np.mean(doc_embedding, axis=0) + embeddings.append(doc_embedding) + + embeddings = np.array(embeddings) + return embeddings diff --git a/keybert/backend/_sentencetransformers.py b/keybert/backend/_sentencetransformers.py new file mode 100644 index 0000000..6b998f7 --- /dev/null +++ b/keybert/backend/_sentencetransformers.py @@ -0,0 +1,54 @@ +import numpy as np +from typing import List, Union +from sentence_transformers import SentenceTransformer + +from keybert.backend import BaseEmbedder + + +class SentenceTransformerBackend(BaseEmbedder): + """ Sentence-transformers embedding model + The sentence-transformers embedding model used for generating document and + word embeddings. 
+ Arguments: + embedding_model: A sentence-transformers embedding model + Usage: + To create a model, you can load in a string pointing to a + sentence-transformers model: + ```python + from keybert.backend import SentenceTransformerBackend + sentence_model = SentenceTransformerBackend("distilbert-base-nli-stsb-mean-tokens") + ``` + or you can instantiate a model yourself: + ```python + from keybert.backend import SentenceTransformerBackend + from sentence_transformers import SentenceTransformer + embedding_model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens") + sentence_model = SentenceTransformerBackend(embedding_model) + ``` + """ + def __init__(self, embedding_model: Union[str, SentenceTransformer]): + super().__init__() + + if isinstance(embedding_model, SentenceTransformer): + self.embedding_model = embedding_model + elif isinstance(embedding_model, str): + self.embedding_model = SentenceTransformer(embedding_model) + else: + raise ValueError("Please select a correct SentenceTransformers model: \n" + "`from sentence_transformers import SentenceTransformer` \n" + "`model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')`") + + def embed(self, + documents: List[str], + verbose: bool = False) -> np.ndarray: + """ Embed a list of n documents/words into an n-dimensional + matrix of embeddings + Arguments: + documents: A list of documents or words to be embedded + verbose: Controls the verbosity of the process + Returns: + Document/words embeddings with shape (n, m) with `n` documents/words + that each have an embeddings size of `m` + """ + embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose) + return embeddings
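Every backend in this commit exposes the same `embed(documents, verbose)` signature, so they are interchangeable inside `KeyBERT`. A minimal sketch of calling one directly — the model name is the one used elsewhere in this diff:

```python
from keybert.backend._sentencetransformers import SentenceTransformerBackend

backend = SentenceTransformerBackend("distilbert-base-nli-stsb-mean-tokens")
embeddings = backend.embed(["a tiny test document"])  # np.ndarray of shape (1, m)
```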
\ No newline at end of file
diff --git a/keybert/backend/_spacy.py b/keybert/backend/_spacy.py
new file mode 100644
index 0000000..8a09f17
--- /dev/null
+++ b/keybert/backend/_spacy.py
@@ -0,0 +1,99 @@
+import numpy as np
+from tqdm import tqdm
+from typing import List
+from keybert.backend import BaseEmbedder
+
+
+class SpacyBackend(BaseEmbedder):
+    """ Spacy embedding model
+
+    The Spacy embedding model used for generating document and
+    word embeddings.
+
+    Arguments:
+        embedding_model: A spacy embedding model
+
+    Usage:
+
+    To create a Spacy backend, you need to create an nlp object and
+    pass it through this backend:
+
+    ```python
+    import spacy
+    from keybert.backend import SpacyBackend
+
+    nlp = spacy.load("en_core_web_md", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
+    spacy_model = SpacyBackend(nlp)
+    ```
+
+    To load in a transformer model use the following:
+
+    ```python
+    import spacy
+    from thinc.api import set_gpu_allocator, require_gpu
+    from keybert.backend import SpacyBackend
+
+    nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
+    set_gpu_allocator("pytorch")
+    require_gpu(0)
+    spacy_model = SpacyBackend(nlp)
+    ```
+
+    If you run into gpu/memory-issues, please use:
+
+    ```python
+    import spacy
+    from keybert.backend import SpacyBackend
+
+    spacy.prefer_gpu()
+    nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
+    spacy_model = SpacyBackend(nlp)
+    ```
+    """
+    def __init__(self, embedding_model):
+        super().__init__()
+
+        if "spacy" in str(type(embedding_model)):
+            self.embedding_model = embedding_model
+        else:
+            raise ValueError("Please select a correct Spacy model by either using a string such as 'en_core_web_md' "
+                             "or creating an nlp model using: `nlp = spacy.load('en_core_web_md')`")
+
+    def embed(self,
+              documents: List[str],
+              verbose: bool = False) -> np.ndarray:
+        """ Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+
+        # Extract embeddings from a transformer model
+        if "transformer" in self.embedding_model.component_names:
+            embeddings = []
+            for doc in tqdm(documents, position=0, leave=True, disable=not verbose):
+                try:
+                    embedding = self.embedding_model(doc)._.trf_data.tensors[-1][0].tolist()
+                except:
+                    embedding = self.embedding_model("An empty document")._.trf_data.tensors[-1][0].tolist()
+                embeddings.append(embedding)
+            embeddings = np.array(embeddings)
+
+        # Extract embeddings from a general spacy model
+        else:
+            embeddings = []
+            for doc in tqdm(documents, position=0, leave=True, disable=not verbose):
+                try:
+                    vector = self.embedding_model(doc).vector
+                except ValueError:
+                    vector = self.embedding_model("An empty document").vector
+                embeddings.append(vector)
+            embeddings = np.array(embeddings)
+
+        return embeddings
diff --git a/keybert/backend/_use.py b/keybert/backend/_use.py
new file mode 100644
index 0000000..a39ccde
--- /dev/null
+++ b/keybert/backend/_use.py
@@ -0,0 +1,54 @@
+import numpy as np
+from tqdm import tqdm
+from typing import List
+
+from keybert.backend import BaseEmbedder
+
+
+class USEBackend(BaseEmbedder):
+    """ Universal Sentence Encoder
+
+    USE encodes text into high-dimensional vectors that
+    are used for semantic similarity in KeyBERT.
+
+    Arguments:
+        embedding_model: A USE embedding model
+
+    Usage:
+
+    ```python
+    import tensorflow_hub
+    from keybert.backend import USEBackend
+
+    embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
+    use_embedder = USEBackend(embedding_model)
+    ```
+    """
+    def __init__(self, embedding_model):
+        super().__init__()
+
+        try:
+            embedding_model(["test sentence"])
+            self.embedding_model = embedding_model
+        except TypeError:
+            raise ValueError("Please select a correct USE model: \n"
+                             "`import tensorflow_hub` \n"
+                             "`embedding_model = tensorflow_hub.load(path_to_model)`")
+
+    def embed(self,
+              documents: List[str],
+              verbose: bool = False) -> np.ndarray:
+        """ Embed a list of n documents/words into an n-dimensional
+        matrix of embeddings
+
+        Arguments:
+            documents: A list of documents or words to be embedded
+            verbose: Controls the verbosity of the process
+
+        Returns:
+            Document/words embeddings with shape (n, m) with `n` documents/words
+            that each have an embeddings size of `m`
+        """
+        embeddings = np.array([self.embedding_model([doc]).cpu().numpy()[0]
+                               for doc in tqdm(documents, disable=not verbose)])
+        return embeddings
diff --git a/keybert/backend/_utils.py b/keybert/backend/_utils.py
new file mode 100644
index 0000000..0e89bf1
--- /dev/null
+++ b/keybert/backend/_utils.py
@@ -0,0 +1,44 @@
+from ._base import BaseEmbedder
+from ._sentencetransformers import SentenceTransformerBackend
+
+
+def select_backend(embedding_model) -> BaseEmbedder:
+    """ Select an embedding model based on language or a specific sentence transformer model.
+
+    When selecting a language, we choose distilbert-base-nli-stsb-mean-tokens for English and
+    xlm-r-bert-base-nli-stsb-mean-tokens for all other languages as it supports 100+ languages.
+
+    Returns:
+        model: The selected backend, wrapped in a `BaseEmbedder` subclass
+    """
+    # keybert language backend
+    if isinstance(embedding_model, BaseEmbedder):
+        return embedding_model
+
+    # Flair word embeddings
+    if "flair" in str(type(embedding_model)):
+        from keybert.backend._flair import FlairBackend
+        return FlairBackend(embedding_model)
+
+    # Spacy embeddings
+    if "spacy" in str(type(embedding_model)):
+        from keybert.backend._spacy import SpacyBackend
+        return SpacyBackend(embedding_model)
+
+    # Gensim embeddings
+    if "gensim" in str(type(embedding_model)):
+        from keybert.backend._gensim import GensimBackend
+        return GensimBackend(embedding_model)
+
+    # USE embeddings
+    if "tensorflow" in str(type(embedding_model)) and "saved_model" in str(type(embedding_model)):
+        from keybert.backend._use import USEBackend
+        return USEBackend(embedding_model)
+
+    # Sentence Transformer embeddings
+    if "sentence_transformers" in str(type(embedding_model)):
+        return SentenceTransformerBackend(embedding_model)
+
+    # Create a Sentence Transformer model based on a string
+    if isinstance(embedding_model, str):
+        return SentenceTransformerBackend(embedding_model)
+
+    return SentenceTransformerBackend("xlm-r-bert-base-nli-stsb-mean-tokens")
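A quick sketch of what this dispatcher does with the two most common inputs (illustrative; the model names match those used above):

```python
from keybert.backend._utils import select_backend

# A plain string is wrapped in a SentenceTransformerBackend for that model
backend = select_backend("distilbert-base-nli-stsb-mean-tokens")

# Any unrecognized input falls through to the multilingual default
default = select_backend(None)  # -> SentenceTransformerBackend("xlm-r-bert-base-nli-stsb-mean-tokens")
```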
\ No newline at end of file diff --git a/keybert/model.py b/keybert/model.py index 4653123..586d216 100644 --- a/keybert/model.py +++ b/keybert/model.py @@ -4,21 +4,13 @@ warnings.filterwarnings("ignore", category=FutureWarning) import numpy as np from tqdm import tqdm from typing import List, Union, Tuple -from sentence_transformers import SentenceTransformer from sklearn.metrics.pairwise import cosine_similarity from sklearn.feature_extraction.text import CountVectorizer -from .mmr import mmr -from .maxsum import max_sum_similarity - -# Flair -try: - from flair.embeddings import DocumentEmbeddings, TokenEmbeddings, DocumentPoolEmbeddings - from flair.data import Sentence - _HAS_FLAIR = True -except ModuleNotFoundError as e: - DocumentEmbeddings, TokenEmbeddings, DocumentPoolEmbeddings = None, None, None - _HAS_FLAIR = False +# KeyBERT +from keybert.mmr import mmr +from keybert.maxsum import max_sum_similarity +from keybert.backend._utils import select_backend class KeyBERT: @@ -38,23 +30,26 @@ class KeyBERT: """ def __init__(self, - model: Union[str, - SentenceTransformer, - DocumentEmbeddings, - TokenEmbeddings] = 'distilbert-base-nli-mean-tokens'): + model='distilbert-base-nli-mean-tokens'): """ KeyBERT initialization Arguments: - model: Use a custom embedding model. You can pass in a string related - to one of the following models: - https://www.sbert.net/docs/pretrained_models.html - You can also pass in a SentenceTransformer() model or a Flair - DocumentEmbedding model. + model: Use a custom embedding model. + The following backends are currently supported + * SentenceTransformers + * Flair + * Spacy + * Gensim + * USE (TF-Hub) + You can also pass in a string that points to one of the following + sentence-transformers models: + * https://www.sbert.net/docs/pretrained_models.html """ - self.model = self._select_embedding_model(model) + self.model = select_backend(model) def extract_keywords(self, docs: Union[str, List[str]], + candidates: List[str] = None, keyphrase_ngram_range: Tuple[int, int] = (1, 1), stop_words: Union[str, List[str]] = 'english', top_n: int = 5, @@ -84,6 +79,7 @@ class KeyBERT: Arguments: docs: The document(s) for which to extract keywords/keyphrases + candidates: Candidate keywords/keyphrases to use instead of extracting them from the document(s) keyphrase_ngram_range: Length, in words, of the extracted keywords/keyphrases stop_words: Stopwords to remove from the document top_n: Return the top n keywords/keyphrases @@ -106,15 +102,16 @@ class KeyBERT: """ if isinstance(docs, str): - return self._extract_keywords_single_doc(docs, - keyphrase_ngram_range, - stop_words, - top_n, - use_maxsum, - use_mmr, - diversity, - nr_candidates, - vectorizer) + return self._extract_keywords_single_doc(doc=docs, + candidates=candidates, + keyphrase_ngram_range=keyphrase_ngram_range, + stop_words=stop_words, + top_n=top_n, + use_maxsum=use_maxsum, + use_mmr=use_mmr, + diversity=diversity, + nr_candidates=nr_candidates, + vectorizer=vectorizer) elif isinstance(docs, list): warnings.warn("Although extracting keywords for multiple documents is faster " "than iterating over single documents, it requires significantly more memory " @@ -128,6 +125,7 @@ class KeyBERT: def _extract_keywords_single_doc(self, doc: str, + candidates: List[str] = None, keyphrase_ngram_range: Tuple[int, int] = (1, 1), stop_words: Union[str, List[str]] = 'english', top_n: int = 5, @@ -140,6 +138,7 @@ class KeyBERT: Arguments: doc: The document for which to extract keywords/keyphrases + candidates: Candidate 
keywords/keyphrases to use instead of extracting them from the document(s) keyphrase_ngram_range: Length, in words, of the extracted keywords/keyphrases stop_words: Stopwords to remove from the document top_n: Return the top n keywords/keyphrases @@ -152,30 +151,28 @@ class KeyBERT: Returns: keywords: the top n keywords for a document with their respective distances to the input document - """ try: # Extract Words - if vectorizer: - count = vectorizer.fit([doc]) - else: - count = CountVectorizer(ngram_range=keyphrase_ngram_range, stop_words=stop_words).fit([doc]) - words = count.get_feature_names() + if candidates is None: + if vectorizer: + count = vectorizer.fit([doc]) + else: + count = CountVectorizer(ngram_range=keyphrase_ngram_range, stop_words=stop_words).fit([doc]) + candidates = count.get_feature_names() # Extract Embeddings - doc_embedding = self._extract_embeddings([doc]) - word_embeddings = self._extract_embeddings(words) - # doc_embedding = self.model.encode([doc]) - # word_embeddings = self.model.encode(words) + doc_embedding = self.model.embed([doc]) + candidate_embeddings = self.model.embed(candidates) # Calculate distances and extract keywords if use_mmr: - keywords = mmr(doc_embedding, word_embeddings, words, top_n, diversity) + keywords = mmr(doc_embedding, candidate_embeddings, candidates, top_n, diversity) elif use_maxsum: - keywords = max_sum_similarity(doc_embedding, word_embeddings, words, top_n, nr_candidates) + keywords = max_sum_similarity(doc_embedding, candidate_embeddings, candidates, top_n, nr_candidates) else: - distances = cosine_similarity(doc_embedding, word_embeddings) - keywords = [(words[index], round(float(distances[0][index]), 4)) + distances = cosine_similarity(doc_embedding, candidate_embeddings) + keywords = [(candidates[index], round(float(distances[0][index]), 4)) for index in distances.argsort()[0][-top_n:]][::-1] return keywords @@ -191,7 +188,8 @@ class KeyBERT: vectorizer: CountVectorizer = None) -> List[List[Tuple[str, float]]]: """ Extract keywords/keyphrases for a multiple documents - This currently does not use MMR as + This currently does not use MMR and Max Sum Similarity as it cannot + process these methods in bulk. Arguments: docs: The document for which to extract keywords/keyphrases @@ -204,7 +202,6 @@ class KeyBERT: Returns: keywords: the top n keywords for a document with their respective distances to the input document - """ # Extract words if vectorizer: @@ -215,10 +212,8 @@ class KeyBERT: df = count.transform(docs) # Extract embeddings - word_embeddings = self._extract_embeddings(words) - doc_embeddings = self._extract_embeddings(docs) - # word_embeddings = self.model.encode(words, show_progress_bar=True) - # doc_embeddings = self.model.encode(docs, show_progress_bar=True) + doc_embeddings = self.model.embed(docs) + word_embeddings = self.model.embed(words) # Extract keywords keywords = [] @@ -234,82 +229,3 @@ class KeyBERT: keywords.append(["None Found"]) return keywords - - def _extract_embeddings(self, documents: Union[List[str], str]) -> np.ndarray: - """ Extract sentence/document embeddings through pre-trained embeddings - - For an overview of pre-trained models: https://www.sbert.net/docs/pretrained_models.html - - Arguments: - documents: Dataframe with documents and their corresponding IDs - - Returns: - embeddings: The extracted embeddings using the sentence transformer - module. Typically uses pre-trained huggingface models. 
- """ - if isinstance(documents, str): - documents = [documents] - - # Infer embeddings with SentenceTransformer - if isinstance(self.model, SentenceTransformer): - embeddings = self.model.encode(documents) - - # Infer embeddings with Flair - elif isinstance(self.model, DocumentEmbeddings): - embeddings = [] - for index, document in enumerate(documents): - try: - sentence = Sentence(document) if document else Sentence("an empty document") - self.model.embed(sentence) - except RuntimeError: - sentence = Sentence("an empty document") - self.model.embed(sentence) - embedding = sentence.embedding.detach().cpu().numpy() - embeddings.append(embedding) - embeddings = np.asarray(embeddings) - - else: - raise ValueError("An incorrect embedding model type was selected.") - - return embeddings - - def _select_embedding_model(self, model: Union[str, - SentenceTransformer, - DocumentEmbeddings, - TokenEmbeddings]) -> Union[SentenceTransformer, - DocumentEmbeddings]: - """ Select an embedding model based on language or a specific sentence transformer models. - When selecting a language, we choose distilbert-base-nli-stsb-mean-tokens for English and - xlm-r-bert-base-nli-stsb-mean-tokens for all other languages as it support 100+ languages. - - Arguments: - model: Use a custom embedding model. You can pass in a string related - to one of the following models: - https://www.sbert.net/docs/pretrained_models.html - You can also pass in a SentenceTransformer() model or a Flair - DocumentEmbedding model. - - Returns: - model: Either a Sentence-Transformer or Flair model - """ - - # Sentence Transformer embeddings - if isinstance(model, SentenceTransformer): - return model - - # Flair word embeddings - elif _HAS_FLAIR and isinstance(model, TokenEmbeddings): - return DocumentPoolEmbeddings([model]) - - # Flair document embeddings + disable fine tune to prevent CUDA OOM - # https://github.com/flairNLP/flair/issues/1719 - elif _HAS_FLAIR and isinstance(model, DocumentEmbeddings): - if "fine_tune" in model.__dict__: - model.fine_tune = False - return model - - # Select embedding model based on specific sentence transformer model - elif isinstance(model, str): - return SentenceTransformer(model) - - return SentenceTransformer("xlm-r-bert-base-nli-stsb-mean-tokens") @@ -9,11 +9,12 @@ nav: - Home: index.md - Guides: - Quickstart: guides/quickstart.md - - Embeddings: guides/embeddings.md + - Embedding Models: guides/embeddings.md - API: - KeyBERT: api/keybert.md - MMR: api/mmr.md - MaxSum: api/maxsum.md + - Changelog: changelog.md plugins: - mkdocstrings: watch: @@ -36,11 +37,7 @@ theme: primary: black accent: blue markdown_extensions: - - codehilite - - pymdownx.inlinehilite - - pymdownx.details - - pymdownx.tabbed - pymdownx.highlight: - use_pygments: true + - pymdownx.superfences: - toc: permalink: true @@ -1,4 +1,4 @@ -import setuptools +from setuptools import setup, find_packages test_packages = [ "pytest>=5.4.3", @@ -18,20 +18,35 @@ docs_packages = [ ] flair_packages = [ + "torch>=1.4.0,<1.7.1", "flair==0.7" ] -extra_package = flair_packages +spacy_packages = [ + "spacy>=3.0.1" +] + +use_packages = [ + "tensorflow", + "tensorflow_hub", + "tensorflow_text" +] + +gensim_packages = [ + "gensim>=3.6.0" +] + +extra_packages = flair_packages + spacy_packages + use_packages + gensim_packages dev_packages = docs_packages + test_packages -with open("README.md", "r") as fh: +with open("README.md", "r", encoding='utf-8') as fh: long_description = fh.read() -setuptools.setup( +setup( name="keybert", - packages=["keybert"], 
- version="0.2.0", + packages=find_packages(exclude=["notebooks", "docs"]), + version="0.3.0", author="Maarten Grootendorst", author_email="maartengrootendorst@gmail.com", description="KeyBERT performs keyword extraction with state-of-the-art transformer models.", @@ -60,7 +75,7 @@ setuptools.setup( "docs": docs_packages, "dev": dev_packages, "flair": flair_packages, - "all": extra_package + "all": extra_packages }, python_requires='>=3.6', )
\ No newline at end of file
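For completeness, a quick post-install sanity check of this release — a sketch, with the version string following keybert/__init__.py above:

```python
# Verify the release installed from this commit's setup.py
import keybert

assert keybert.__version__ == "0.3.0"
```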