diff mbox series

[bug#73266,6/9] gnu: Add python-cutlery.

Message ID 20240915085720.13323-6-ngraves@ngraves.fr
State New
Headers show
Series [bug#73266,1/9] gnu: Add python-azure-storage-file-datalake. | expand

Commit Message

Nicolas Graves Sept. 15, 2024, 8:57 a.m. UTC
* gnu/packages/machine-learning.scm (python-cutlery): New variable.

Change-Id: I5304205737330850ce84a49df814b96a4d605699
---
 gnu/packages/machine-learning.scm | 38 +++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
diff mbox series

Patch

diff --git a/gnu/packages/machine-learning.scm b/gnu/packages/machine-learning.scm
index 008bf2060a..89fcd3c1b7 100644
--- a/gnu/packages/machine-learning.scm
+++ b/gnu/packages/machine-learning.scm
@@ -2442,6 +2442,44 @@  (define-public python-cmaes
 Covariance Matrix Adaptation Evolution Strategy (CMA-ES) for Python.")
     (license license:expat)))
 
+(define-public python-cutlery
+  (package
+    (name "python-cutlery")
+    (version "0.0.6")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (pypi-uri "cutlery" version))
+       (sha256
+        (base32 "1l5jv0mkmvzlmglz61py6f4inil2iwgh1ap8881cyb6k7hnnccc9"))))
+    (build-system pyproject-build-system)
+    (arguments
+     (list
+      #:phases
+      #~(modify-phases %standard-phases
+          ;; The in-tree "cutlery" directory takes precedence over the
+          ;; installed package but is missing the shared libraries: keep
+          ;; only its tests, delete it, and test the installed copy.
+          (add-before 'check 'pre-check
+            (lambda* (#:key tests? inputs outputs #:allow-other-keys)
+              (when tests?
+                (copy-recursively "cutlery/tests" "tests")
+                (delete-file-recursively "cutlery")
+                (add-installed-pythonpath inputs outputs)))))))
+    (propagated-inputs (list python-regex))
+    (native-inputs (list python-cython python-pytest))
+    (home-page "https://github.com/explosion/curated-tokenizers")
+    (synopsis "Lightweight piece tokenization library")
+    (description "This package provides a lightweight wordpiece and
+sentencepiece tokenization library.  It supports multiple tokenizers:
+@itemize
+@item BPE
+@item Byte BPE
+@item Unigram
+@item Wordpiece
+@end itemize")
+    (license license:expat)))
+
 (define-public python-autograd
   (let* ((commit "c6d81ce7eede6db801d4e9a92b27ec5d409d0eab")
          (revision "0")