diff mbox series

[bug#73106,08/10] gnu: Add rust-tokenizers.

Message ID 20240907165626.22651-8-ngraves@ngraves.fr
State New
Headers show
Series None | expand

Commit Message

Nicolas Graves Sept. 7, 2024, 4:56 p.m. UTC
* gnu/packages/machine-learning.scm (rust-tokenizers): New variable.

Change-Id: I3189a2d826f072f65ad053d77eb39be39775f1c2
---
 gnu/packages/machine-learning.scm | 60 +++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
diff mbox series

Patch

diff --git a/gnu/packages/machine-learning.scm b/gnu/packages/machine-learning.scm
index 27d7f0526b..3b601f6c91 100644
--- a/gnu/packages/machine-learning.scm
+++ b/gnu/packages/machine-learning.scm
@@ -5675,6 +5675,66 @@  (define-public rust-hf-hub-0.3
 python package, but only implements a smaller subset of functions.")
     (license license:asl2.0)))
 
+(define-public rust-tokenizers
+  (package
+    (name "rust-tokenizers")
+    (version "0.19.1")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (crate-uri "tokenizers" version))
+       (file-name (string-append name "-" version ".tar.gz"))
+       (sha256
+        (base32 "1zg6ffpllygijb5bh227m9p4lrhf0pjkysky68kddwrsvp8zl075"))
+       (modules '((guix build utils)))
+       (snippet
+        #~(substitute* "Cargo.toml"
+            ;; rust-monostate 0.1.12 requires a rust-syn-2 update.
+            (("\"0\\.1\\.12\"") "\"0.1.11\"")
+            (("version = \"6\\.4\"") "version = \"6.1.1\""))))) ; rust-onig
+    (build-system cargo-build-system)
+    (arguments
+     (list
+      #:tests? #f  ; tests rely on missing data.
+      #:cargo-inputs
+      `(("rust-aho-corasick" ,rust-aho-corasick-1)
+        ("rust-derive-builder" ,rust-derive-builder-0.20)
+        ("rust-esaxx-rs" ,rust-esaxx-rs-0.1)
+        ("rust-fancy-regex" ,rust-fancy-regex-0.13)
+        ("rust-getrandom" ,rust-getrandom-0.2)
+        ("rust-hf-hub" ,rust-hf-hub-0.3)
+        ("rust-indicatif" ,rust-indicatif-0.17)
+        ("rust-itertools" ,rust-itertools-0.12)
+        ("rust-lazy-static" ,rust-lazy-static-1)
+        ("rust-log" ,rust-log-0.4)
+        ("rust-macro-rules-attribute" ,rust-macro-rules-attribute-0.2)
+        ("rust-monostate" ,rust-monostate-0.1)
+        ("rust-onig" ,rust-onig-6)
+        ("rust-paste" ,rust-paste-1)
+        ("rust-rand" ,rust-rand-0.8)
+        ("rust-rayon" ,rust-rayon-1)
+        ("rust-rayon-cond" ,rust-rayon-cond-0.3)
+        ("rust-regex" ,rust-regex-1)
+        ("rust-regex-syntax" ,rust-regex-syntax-0.8)
+        ("rust-serde" ,rust-serde-1)
+        ("rust-serde-json" ,rust-serde-json-1)
+        ("rust-spm-precompiled" ,rust-spm-precompiled-0.1)
+        ("rust-thiserror" ,rust-thiserror-1)
+        ("rust-unicode-categories" ,rust-unicode-categories-0.1)
+        ("rust-unicode-normalization-alignments"
+         ,rust-unicode-normalization-alignments-0.1)
+        ("rust-unicode-segmentation" ,rust-unicode-segmentation-1))
+      #:cargo-development-inputs
+      `(("rust-assert-approx-eq" ,rust-assert-approx-eq-1)
+        ("rust-criterion" ,rust-criterion-0.5)
+        ("rust-tempfile" ,rust-tempfile-3))))
+    (home-page "https://github.com/huggingface/tokenizers")
+    (synopsis "Implementation of various popular tokenizers")
+    (description
+     "This package provides a Rust implementation of today's most used
+tokenizers, with a focus on performance and versatility.")
+    (license license:asl2.0)))
+
 (define-public python-hmmlearn
   (package
     (name "python-hmmlearn")