diff mbox series

[bug#73115] gnu: Add python-sentence-transformers.

Message ID 20240908000927.29091-1-ngraves@ngraves.fr
State New
Headers show
Series [bug#73115] gnu: Add python-sentence-transformers. | expand

Commit Message

Nicolas Graves Sept. 8, 2024, 12:09 a.m. UTC
* gnu/packages/machine-learning.scm (python-sentence-transformers): New variable.

Change-Id: Iedab56f6c2bdde12e654ba67695cd996122bdb0b
---
 gnu/packages/machine-learning.scm | 54 +++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
diff mbox series

Patch

diff --git a/gnu/packages/machine-learning.scm b/gnu/packages/machine-learning.scm
index 42842d7d61..b2da07e8f0 100644
--- a/gnu/packages/machine-learning.scm
+++ b/gnu/packages/machine-learning.scm
@@ -1239,6 +1239,60 @@  (define-public python-sentencepiece
 unsupervised text tokenizer.")
     (license license:asl2.0)))
 
+(define-public python-sentence-transformers
+  (package
+    (name "python-sentence-transformers")
+    (version "3.0.1")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (pypi-uri "sentence_transformers" version))
+       (sha256
+        (base32 "1xmzbyrlp6wa7adf42n67c544db17nz95b10ri603lf4gi9jqgca"))))
+    (build-system pyproject-build-system)
+    (arguments
+     (list
+      #:test-flags `(list
+                     ;; These test files rely on fixtures or train/test data
+                     ;; that are missing, or they require internet access.
+                     "--ignore=tests/test_sentence_transformer.py"
+                     "--ignore=tests/test_train_stsb.py"
+                     "--ignore=tests/test_compute_embeddings.py"
+                     "--ignore=tests/test_cross_encoder.py"
+                     "--ignore=tests/test_model_card_data.py"
+                     "--ignore=tests/test_multi_process.py"
+                     "--ignore=tests/test_pretrained_stsb.py"
+                     "-k" ,(string-append
+                            "not test_LabelAccuracyEvaluator"
+                            " and not test_ParaphraseMiningEvaluator"
+                            " and not test_cmnrl_same_grad"
+                            " and not test_paraphrase_mining"
+                            " and not test_simple_encode"))))
+    (propagated-inputs (list python-huggingface-hub
+                             python-numpy
+                             python-pillow
+                             python-pytorch
+                             python-scikit-learn
+                             python-scipy
+                             python-tqdm
+                             python-transformers))
+    (native-inputs (list python-pytest))
+    (home-page "https://www.SBERT.net")
+    (synopsis "Multilingual text embeddings")
+    (description "This framework provides an easy method to compute dense
+vector representations for sentences, paragraphs, and images.  The models are
+based on transformer networks like BERT / RoBERTa / XLM-RoBERTa and achieve
+state-of-the-art performance in various tasks.  Text is embedded in vector
+space such that similar texts are closer and can efficiently be found using
+cosine similarity.
+
+This package provides easy access to pretrained models for more than 100
+languages, fine-tuned for various use-cases.
+
+Further, this framework allows an easy fine-tuning of custom embeddings
+models, to achieve maximal performance on your specific task.")
+    (license license:asl2.0)))
+
 (define-public python-spacy-legacy
   (package
     (name "python-spacy-legacy")