diff mbox series

[bug#42117,12/17] gnu: Add r-tokenizers.

Message ID 20200629055042.8565-12-peterloleungyau@gmail.com
State Accepted
Headers show
Series [bug#42117,01/17] gnu: Add r-hardhat. | expand

Checks

Context Check Description
cbaines/applying patch fail View Laminar job

Commit Message

Peter Lo June 29, 2020, 5:50 a.m. UTC
* gnu/packages/cran.scm (r-tokenizers): New variable.
---
 gnu/packages/cran.scm | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
diff mbox series

Patch

diff --git a/gnu/packages/cran.scm b/gnu/packages/cran.scm
index 0dcf8d20f3..26c3c1e562 100644
--- a/gnu/packages/cran.scm
+++ b/gnu/packages/cran.scm
@@ -22670,3 +22670,37 @@  analysis.  These novels are \"Sense and Sensibility\", \"Pride and
 Prejudice\", \"Mansfield Park\", \"Emma\", \"Northanger Abbey\", and
 \"Persuasion\".")
     (license license:expat)))
+
+;; R package "tokenizers" from CRAN: the source tarball is located through
+;; cran-uri for the version below and the package uses r-build-system.
+(define-public r-tokenizers
+  (package
+    (name "r-tokenizers")
+    (version "0.2.1")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cran-uri "tokenizers" version))
+       (sha256
+        (base32
+         "006xf1vdrmp9skhpss9ldhmk4cwqk512cjp1pxm2gxfybpf7qq98"))))
+    (properties `((upstream-name . "tokenizers")))
+    (build-system r-build-system)
+    (propagated-inputs
+     `(("r-rcpp" ,r-rcpp)
+       ("r-snowballc" ,r-snowballc)
+       ("r-stringi" ,r-stringi)))
+    ;; NOTE(review): knitr presumably only builds the vignettes -- confirm.
+    (native-inputs `(("r-knitr" ,r-knitr)))
+    (home-page "https://lincolnmullen.com/software/tokenizers/")
+    (synopsis "Fast, consistent tokenization of natural language text")
+    (description
+     "This package converts natural language text into tokens.  It includes
+tokenizers for shingled n-grams, skip n-grams, words, word stems, sentences,
+paragraphs, characters, shingled characters, lines, tweets, Penn Treebank,
+regular expressions, as well as functions for counting characters, words, and
+sentences, and a function for splitting longer texts into separate documents,
+each with the same number of words.  The tokenizers have a consistent
+interface, and the package is built on the @code{stringi} and @code{Rcpp}
+packages for fast yet correct tokenization in UTF-8.")
+    (license license:expat)))