Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/MaartenGr/KeyBERT.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'keybert/model.py')
-rw-r--r--keybert/model.py29
1 file changed, 24 insertions, 5 deletions
diff --git a/keybert/model.py b/keybert/model.py
index 1b8f7ac..0910c2b 100644
--- a/keybert/model.py
+++ b/keybert/model.py
@@ -8,6 +8,25 @@ import warnings
class KeyBERT:
+ """
+ A minimal method for keyword extraction with BERT
+
+ The keyword extraction is done by finding the sub-phrases in
+ a document that are the most similar to the document itself.
+
+ First, document embeddings are extracted with BERT to get a
+ document-level representation. Then, word embeddings are extracted
+ for N-gram words/phrases. Finally, we use cosine similarity to find the
+ words/phrases that are the most similar to the document.
+
+ The most similar words could then be identified as the words that
+ best describe the entire document.
+
+ Arguments:
+ model: the name of the model used by sentence-transformer
+ for a full overview see https://www.sbert.net/docs/pretrained_models.html
+
+ """
def __init__(self, model: str = 'distilbert-base-nli-mean-tokens'):
self.model = SentenceTransformer(model)
self.doc_embeddings = None
@@ -20,10 +39,10 @@ class KeyBERT:
min_df: int = 1) -> Union[List[str], List[List[str]]]:
""" Extract keywords/keyphrases
- NOTE: I would advise you to use
-
- Single Document:
-
+ NOTE:
+ I would advise you to iterate over single documents as they
+ will need the least amount of memory. Even though this is slower,
+ you are not likely to run into memory errors.
Multiple Documents:
There is an option to extract keywords for multiple documents
@@ -44,7 +63,7 @@ class KeyBERT:
if keywords for multiple documents need to be extracted
Returns:
- keywords: The top n keywords for a document
+ keywords: the top n keywords for a document
"""