[bug#77387,1/2] man-db: Parse man macro arguments better.

Message ID 7068d5eb173f3d0c7cb69ae9592cb61b2dd84281.1743343624.git.sarg@sarg.org.ru
State New
Headers
Series man-db: Better parsing of man macros. |

Commit Message

Sergey Trofimov March 30, 2025, 2:32 p.m. UTC
  * guix/man-db.scm (man-macro-tokenize): New procedure to parse man
macros.
(man-page->entry): Parse macro line using man-macro-tokenize.

Change-Id: Iea0ffbc65290757df746138e0a6174646b5a3eb8
---
 guix/man-db.scm | 52 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 9 deletions(-)
  

Comments

Ludovic Courtès April 1, 2025, 12:07 p.m. UTC | #1
Hi!

Glad you fixed this problem. :-)

Sergey Trofimov <sarg@sarg.org.ru> skribis:

> * guix/man-db.scm (man-macro-tokenize): New procedure to parse man
> macros.
> (man-page->entry): Parse macro line using man-macro-tokenize.
>
> Change-Id: Iea0ffbc65290757df746138e0a6174646b5a3eb8

[...]

> +(define (man-macro-tokenize input)

Could you add a docstring explaining what it takes and what it returns?

> +  (let loop ((pos 0)
> +             (tokens '())
> +             (current '())

Maybe s/current/characters/ ?

> +             (in-string? #f))
> +    (if (>= pos (string-length input))
> +        ;; End of input
> +        (unless in-string?
> +          (reverse (if (null? current)
> +                       tokens
> +                       (cons (list->string (reverse current)) tokens))))

So this procedure can return *unspecified*, right?  Sounds fishy.

> @@ -189,8 +223,8 @@ (define* (man-page->entry file #:optional (resolve identity))
>                (if (eof-object? line)
>                    (mandb-entry file name (or section 0) (or synopsis "")
>                                 kind)
> -                  (match (string-tokenize line)
> -                    ((".TH" name (= string->number* section) _ ...)
> +                  (match (if (string-prefix? "." line) (man-macro-tokenize line) #f)
> +                    ((".TH" name (= string->number section) _ ...)

Please add a comment above ‘match’ explaining what’s happening (why we
call ‘man-macro-tokenize’ etc.).

Also: (and (string-prefix? "." line) (man-macro-tokenize line))

Ludo’.
  
Sergey Trofimov April 1, 2025, 7:42 p.m. UTC | #2
Hi Ludovic,

I've sent an amended series.

Ludovic Courtès <ludo@gnu.org> writes:

>> +             (in-string? #f))
>> +    (if (>= pos (string-length input))
>> +        ;; End of input
>> +        (unless in-string?
>> +          (reverse (if (null? current)
>> +                       tokens
>> +                       (cons (list->string (reverse current)) tokens))))
>
> So this procedure can return *unspecified*, right?  Sounds fishy.
>
Why is it fishy? Is it unconventional? Such return value is handled
correctly by the calling code (`match`).
  

Patch

diff --git a/guix/man-db.scm b/guix/man-db.scm
index bba90ed473..44c01ac298 100644
--- a/guix/man-db.scm
+++ b/guix/man-db.scm
@@ -161,16 +161,50 @@  (define (read-synopsis port)
       (line
        (loop (cons line lines))))))
 
+(define (man-macro-tokenize input)
+  (let loop ((pos 0)
+             (tokens '())
+             (current '())
+             (in-string? #f))
+    (if (>= pos (string-length input))
+        ;; End of input
+        (unless in-string?
+          (reverse (if (null? current)
+                       tokens
+                       (cons (list->string (reverse current)) tokens))))
+        (let ((c (string-ref input pos)))
+          (cond
+           ;; Inside a string
+           (in-string?
+            (if (char=? c #\")
+                (if (and (< (+ pos 1) (string-length input))
+                         (char=? (string-ref input (+ pos 1)) #\"))
+                    ;; Double quote inside string
+                    (loop (+ pos 2) tokens (cons #\" current) #t)
+                    ;; End of string
+                    (loop (+ pos 1) (cons (list->string (reverse current)) tokens) '() #f))
+                ;; Regular character in string
+                (loop (+ pos 1) tokens (cons c current) #t)))
+
+           ;; Whitespace outside string
+           ((char-whitespace? c)
+            (if (null? current)
+                (loop (+ pos 1) tokens '() #f)
+                (loop (+ pos 1) (cons (list->string (reverse current)) tokens) '() #f)))
+
+           ;; Start of string
+           ((char=? c #\")
+            (if (null? current)
+                (loop (+ pos 1) tokens '() #t)
+                (loop pos (cons (list->string (reverse current)) tokens) '() #f)))
+
+           ;; Symbol character
+           (else
+            (loop (+ pos 1) tokens (cons c current) #f)))))))
+
 (define* (man-page->entry file #:optional (resolve identity))
   "Parse FILE, a gzip or zstd compressed man page, and return a <mandb-entry>
 for it."
-  (define (string->number* str)
-    (if (and (string-prefix? "\"" str)
-             (> (string-length str) 1)
-             (string-suffix? "\"" str))
-        (string->number (string-drop (string-drop-right str 1) 1))
-        (string->number str)))
-
   (define call-with-input-port*
     (cond
      ((gzip-compressed? file) call-with-gzip-input-port)
@@ -189,8 +223,8 @@  (define* (man-page->entry file #:optional (resolve identity))
               (if (eof-object? line)
                   (mandb-entry file name (or section 0) (or synopsis "")
                                kind)
-                  (match (string-tokenize line)
-                    ((".TH" name (= string->number* section) _ ...)
+                  (match (if (string-prefix? "." line) (man-macro-tokenize line) #f)
+                    ((".TH" name (= string->number section) _ ...)
                      (loop name section synopsis kind))
                     ((".SH" (or "NAME" "\"NAME\""))
                      (loop name section (read-synopsis port) kind))