diff mbox series

[bug#39258,4/4] gnu: Use xapian index for package search.

Message ID 20200227204150.30985-5-arunisaac@systemreboot.net
State Work in progress
Headers show
Series Xapian for Guix package search | expand

Checks

Context Check Description
cbaines/comparison success View comparision
cbaines/git branch success View Git branch
cbaines/applying patch success View Laminar job

Commit Message

Arun Isaac Feb. 27, 2020, 8:41 p.m. UTC
* gnu/packages.scm (search-package-index): New function.
* guix/scripts/package.scm (find-packages-by-description): Search using the
xapian package index if search patterns are literal strings. Else, search
using fold-packages.
---
 gnu/packages.scm         | 17 +++++++++++-
 guix/scripts/package.scm | 57 +++++++++++++++++++++++-----------------
 2 files changed, 49 insertions(+), 25 deletions(-)

Comments

Pierre Neidhardt Feb. 28, 2020, 8:11 a.m. UTC | #1
Arun Isaac <arunisaac@systemreboot.net> writes:

> @@ -453,6 +454,20 @@ reducing the memory footprint."
>  
>    db-path)
>  
> +(define (search-package-index profile querystring)

Maybe `query-string'?

>  
> --- a/guix/scripts/package.scm
> +++ b/guix/scripts/package.scm
> @@ -7,6 +7,7 @@
>  ;;; Copyright © 2016 Benz Schenk <benz.schenk@uzh.ch>
>  ;;; Copyright © 2016 Chris Marusich <cmmarusich@gmail.com>
>  ;;; Copyright © 2019 Tobias Geerinckx-Rice <me@tobias.gr>
> +;;; Copyright © 2020 Arun Isaac <arunisaac@systemreboot.net>
>  ;;;
>  ;;; This file is part of GNU Guix.
>  ;;;
> @@ -178,31 +179,40 @@ hooks\" run when building the profile."
>  ;;; Package specifications.
>  ;;;
>  
> -(define (find-packages-by-description regexps)
> +(define (find-packages-by-description patterns)
>    "Return a list of pairs: packages whose name, synopsis, description,
>  or output matches at least one of REGEXPS sorted by relevance, and its
>  non-zero relevance score."

Need to update the docstring.

> -  (let ((matches (fold-packages (lambda (package result)
> -                                  (if (package-superseded package)
> -                                      result
> -                                      (match (package-relevance package
> -                                                                regexps)
> -                                        ((? zero?)
> -                                         result)
> -                                        (score
> -                                         (cons (cons package score)
> -                                               result)))))
> -                                '())))
> -    (sort matches
> -          (lambda (m1 m2)
> -            (match m1
> -              ((package1 . score1)
> -               (match m2
> -                 ((package2 . score2)
> -                  (if (= score1 score2)
> -                      (string>? (package-full-name package1)
> -                                (package-full-name package2))
> -                      (> score1 score2))))))))))
> +  (define (regexp? str)
> +    (string-any
> +     (char-set #\. #\[ #\{ #\} #\( #\) #\\ #\* #\+ #\? #\| #\^ #\$)
> +     str))
> +
> +  (if (and (current-profile)
> +           (not (any regexp? patterns)))

I would not put characters like ".", "$", or "+" here, lest we mistake a
Xapian pattern for a regexp.

As you said, I don't think both are compatible without ambiguity
anyways, so we should probably drop regexp (or at least toggle them with
a command line argument).
Simon Tournier March 3, 2020, 7:21 p.m. UTC | #2
Hi Arun,


On Thu, 27 Feb 2020 at 21:42, Arun Isaac <arunisaac@systemreboot.net> wrote:
>
> * gnu/packages.scm (search-package-index): New function.
> * guix/scripts/package.scm (find-packages-by-description): Search using the
> xapian package index if search patterns are literal strings. Else, search
> using fold-packages.
> ---
>  gnu/packages.scm         | 17 +++++++++++-
>  guix/scripts/package.scm | 57 +++++++++++++++++++++++-----------------
>  2 files changed, 49 insertions(+), 25 deletions(-)
>
> diff --git a/gnu/packages.scm b/gnu/packages.scm
> index e91753e2a8..5b5b29bf84 100644
> --- a/gnu/packages.scm
> +++ b/gnu/packages.scm
> @@ -67,7 +67,8 @@
>              specifications->manifest
>
>              generate-package-cache
> -            generate-package-search-index))
> +            generate-package-search-index
> +            search-package-index))
>
>  ;;; Commentary:
>  ;;;
> @@ -453,6 +454,20 @@ reducing the memory footprint."
>
>    db-path)
>
> +(define (search-package-index profile querystring)
> +  (let ((offset 0)
> +        (pagesize 10))

Why this value of 10?
This fix the number of packages returned. Hum?
I have tried to replace by 100 and I got 100 packages. :-)


> +    (call-with-database (string-append profile %package-search-index)
> +      (lambda (db)
> +        (let ((query (parse-query querystring #:stemmer (make-stem "en"))))
> +          (mset-fold (lambda (item result)

I do not know what is the convention for the bindings.
But there is 'fold-packages' so I would be inclined to 'fold-msets' or
something in this flavour.


> +                       (match (find-packages-by-name
> +                               (document-data (mset-item-document item)))
> +                         ((package _ ...)
> +                          (append result `((,package . ,(mset-item-weight item)))))))
> +                     '()
> +                     (enquire-mset (enquire db query) offset pagesize)))))))
> +
>
>  (define %sigint-prompt
>    ;; The prompt to jump to upon SIGINT.
> diff --git a/guix/scripts/package.scm b/guix/scripts/package.scm
> index 1cb0d382bf..6a3b9002dd 100644
> --- a/guix/scripts/package.scm
> +++ b/guix/scripts/package.scm
> @@ -7,6 +7,7 @@
>  ;;; Copyright © 2016 Benz Schenk <benz.schenk@uzh.ch>
>  ;;; Copyright © 2016 Chris Marusich <cmmarusich@gmail.com>
>  ;;; Copyright © 2019 Tobias Geerinckx-Rice <me@tobias.gr>
> +;;; Copyright © 2020 Arun Isaac <arunisaac@systemreboot.net>
>  ;;;
>  ;;; This file is part of GNU Guix.
>  ;;;
> @@ -178,31 +179,40 @@ hooks\" run when building the profile."
>  ;;; Package specifications.
>  ;;;
>
> -(define (find-packages-by-description regexps)
> +(define (find-packages-by-description patterns)
>    "Return a list of pairs: packages whose name, synopsis, description,
>  or output matches at least one of REGEXPS sorted by relevance, and its
>  non-zero relevance score."
> -  (let ((matches (fold-packages (lambda (package result)
> -                                  (if (package-superseded package)
> -                                      result
> -                                      (match (package-relevance package
> -                                                                regexps)
> -                                        ((? zero?)
> -                                         result)
> -                                        (score
> -                                         (cons (cons package score)
> -                                               result)))))
> -                                '())))
> -    (sort matches
> -          (lambda (m1 m2)
> -            (match m1
> -              ((package1 . score1)
> -               (match m2
> -                 ((package2 . score2)
> -                  (if (= score1 score2)
> -                      (string>? (package-full-name package1)
> -                                (package-full-name package2))
> -                      (> score1 score2))))))))))
> +  (define (regexp? str)
> +    (string-any
> +     (char-set #\. #\[ #\{ #\} #\( #\) #\\ #\* #\+ #\? #\| #\^ #\$)
> +     str))

Instead of reverting this, I would let the current
'find-packages-by-description' and would add
'find-packages-by-description-indexed' doing just
'(search-package-index (current-profile) (string-join patterns " "))'.
And maybe refactoring the sort of scores. Then I would put the test
branch in 'guix/scripts/packages.scm'...


> +  (if (and (current-profile)
> +           (not (any regexp? patterns)))
> +      (search-package-index (current-profile) (string-join patterns " "))
> +      (let* ((regexps (map (cut make-regexp* <> regexp/icase) patterns))
> +             (matches (fold-packages (lambda (package result)
> +                                       (if (package-superseded package)
> +                                           result
> +                                           (match (package-relevance package

Note that I am in the process of implementing the BM25 weights as
'package-relevance'; at least really thinking about it! :-)
I have already talked about TF-IDF as relevance, for example here [1].
And reading the Xapian documentation [2], it seems affordable. Or not
;-) because of the regexp... Need some thoughts... I mean "in the
process". ;-)
And in this case, it is almost a drop-in replacement of
'fold-packages' by 'mset-fold'; well it should add some flexibility
and a more unified code.

(Aside the searching, IMHO 'package-relevance' should help too in the
linting process of bad written descriptions, another story. ;-)

[1] https://lists.gnu.org/archive/html/guix-devel/2019-07/msg00252.html
[2] https://xapian.org/docs/bm25.html


> +                                                                     regexps)
> +                                             ((? zero?)
> +                                              result)
> +                                             (score
> +                                              (cons (cons package score)
> +                                                    result)))))
> +                                     '())))
> +        (sort matches
> +              (lambda (m1 m2)
> +                (match m1
> +                  ((package1 . score1)
> +                   (match m2
> +                     ((package2 . score2)
> +                      (if (= score1 score2)
> +                          (string>? (package-full-name package1)
> +                                    (package-full-name package2))
> +                          (> score1 score2)))))))))))
>
>  (define (transaction-upgrade-entry store entry transaction)
>    "Return a variant of TRANSACTION that accounts for the upgrade of ENTRY, a
> @@ -777,8 +787,7 @@ processed, #f otherwise."

...here.

+  (define (regexp? str)
+    (string-any
+     (char-set #\. #\[ #\{ #\} #\( #\) #\\ #\* #\+ #\? #\| #\^ #\$)
+     str))

>                                        (('query 'search rx) rx)
>                                        (_                   #f))
>                                      opts))
>
> -              (regexps  (map (cut make-regexp* <> regexp/icase) patterns))
> -              (matches  (find-packages-by-description regexps)))

+ (if   (any regexp? patterns)
+    (matches (find-packages-by-description regexps))
+    (matches (find-packages-by-description-indexed patterns))

I mean something like that.

>           (leave-on-EPIPE
>            (display-search-results matches (current-output-port)))
>           #t))
> --
> 2.23.0


All the best,
simon
Simon Tournier March 3, 2020, 7:51 p.m. UTC | #3
On Tue, 3 Mar 2020 at 20:21, zimoun <zimon.toutoune@gmail.com> wrote:
> On Thu, 27 Feb 2020 at 21:42, Arun Isaac <arunisaac@systemreboot.net> wrote:

> > +(define (search-package-index profile querystring)
> > +  (let ((offset 0)
> > +        (pagesize 10))
>
> Why this value of 10?
> This fix the number of packages returned. Hum?
> I have tried to replace by 100 and I got 100 packages. :-)

I propose the value of 4294967295 for pagesize.
diff mbox series

Patch

diff --git a/gnu/packages.scm b/gnu/packages.scm
index e91753e2a8..5b5b29bf84 100644
--- a/gnu/packages.scm
+++ b/gnu/packages.scm
@@ -67,7 +67,8 @@ 
             specifications->manifest
 
             generate-package-cache
-            generate-package-search-index))
+            generate-package-search-index
+            search-package-index))
 
 ;;; Commentary:
 ;;;
@@ -453,6 +454,20 @@  reducing the memory footprint."
 
   db-path)
 
+(define (search-package-index profile querystring)
+  (let ((offset 0)
+        (pagesize 10))
+    (call-with-database (string-append profile %package-search-index)
+      (lambda (db)
+        (let ((query (parse-query querystring #:stemmer (make-stem "en"))))
+          (mset-fold (lambda (item result)
+                       (match (find-packages-by-name
+                               (document-data (mset-item-document item)))
+                         ((package _ ...)
+                          (append result `((,package . ,(mset-item-weight item)))))))
+                     '()
+                     (enquire-mset (enquire db query) offset pagesize)))))))
+
 
 (define %sigint-prompt
   ;; The prompt to jump to upon SIGINT.
diff --git a/guix/scripts/package.scm b/guix/scripts/package.scm
index 1cb0d382bf..6a3b9002dd 100644
--- a/guix/scripts/package.scm
+++ b/guix/scripts/package.scm
@@ -7,6 +7,7 @@ 
 ;;; Copyright © 2016 Benz Schenk <benz.schenk@uzh.ch>
 ;;; Copyright © 2016 Chris Marusich <cmmarusich@gmail.com>
 ;;; Copyright © 2019 Tobias Geerinckx-Rice <me@tobias.gr>
+;;; Copyright © 2020 Arun Isaac <arunisaac@systemreboot.net>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -178,31 +179,40 @@  hooks\" run when building the profile."
 ;;; Package specifications.
 ;;;
 
-(define (find-packages-by-description regexps)
+(define (find-packages-by-description patterns)
   "Return a list of pairs: packages whose name, synopsis, description,
 or output matches at least one of REGEXPS sorted by relevance, and its
 non-zero relevance score."
-  (let ((matches (fold-packages (lambda (package result)
-                                  (if (package-superseded package)
-                                      result
-                                      (match (package-relevance package
-                                                                regexps)
-                                        ((? zero?)
-                                         result)
-                                        (score
-                                         (cons (cons package score)
-                                               result)))))
-                                '())))
-    (sort matches
-          (lambda (m1 m2)
-            (match m1
-              ((package1 . score1)
-               (match m2
-                 ((package2 . score2)
-                  (if (= score1 score2)
-                      (string>? (package-full-name package1)
-                                (package-full-name package2))
-                      (> score1 score2))))))))))
+  (define (regexp? str)
+    (string-any
+     (char-set #\. #\[ #\{ #\} #\( #\) #\\ #\* #\+ #\? #\| #\^ #\$)
+     str))
+
+  (if (and (current-profile)
+           (not (any regexp? patterns)))
+      (search-package-index (current-profile) (string-join patterns " "))
+      (let* ((regexps (map (cut make-regexp* <> regexp/icase) patterns))
+             (matches (fold-packages (lambda (package result)
+                                       (if (package-superseded package)
+                                           result
+                                           (match (package-relevance package
+                                                                     regexps)
+                                             ((? zero?)
+                                              result)
+                                             (score
+                                              (cons (cons package score)
+                                                    result)))))
+                                     '())))
+        (sort matches
+              (lambda (m1 m2)
+                (match m1
+                  ((package1 . score1)
+                   (match m2
+                     ((package2 . score2)
+                      (if (= score1 score2)
+                          (string>? (package-full-name package1)
+                                    (package-full-name package2))
+                          (> score1 score2)))))))))))
 
 (define (transaction-upgrade-entry store entry transaction)
   "Return a variant of TRANSACTION that accounts for the upgrade of ENTRY, a
@@ -777,8 +787,7 @@  processed, #f otherwise."
                                       (('query 'search rx) rx)
                                       (_                   #f))
                                     opts))
-              (regexps  (map (cut make-regexp* <> regexp/icase) patterns))
-              (matches  (find-packages-by-description regexps)))
+              (matches  (find-packages-by-description patterns)))
          (leave-on-EPIPE
           (display-search-results matches (current-output-port)))
          #t))