diff mbox series

[bug#74542,v2,12/16] gnu-maintenance: ‘generic-html’ update honors <base href=" … ">.

Message ID 112b57b3d8cf1208f3390602dfab6932fac7c505.1732872499.git.ludo@gnu.org
State New
Headers show
Series Improved tooling for package updates | expand

Commit Message

Ludovic Courtès Nov. 29, 2024, 9:40 a.m. UTC
This fixes updates of ‘curl’: <https://curl.se/download/> includes
<base href="…"> in its head and ignoring it would lead to incorrect
download URLs.

* guix/gnu-maintenance.scm (html-links): Keep track of <base href="…">
in ‘loop’.  Rewrite relative links at the end.

Change-Id: I989da78df3431034c9a584f8e10cad87ae6dc920
---
 guix/gnu-maintenance.scm | 41 +++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)
diff mbox series

Patch

diff --git a/guix/gnu-maintenance.scm b/guix/gnu-maintenance.scm
index b612b11c00..ee4882326f 100644
--- a/guix/gnu-maintenance.scm
+++ b/guix/gnu-maintenance.scm
@@ -39,6 +39,7 @@  (define-module (guix gnu-maintenance)
   #:use-module (guix utils)
   #:use-module (guix diagnostics)
   #:use-module (guix i18n)
+  #:autoload   (guix combinators) (fold2)
   #:use-module (guix memoization)
   #:use-module (guix records)
   #:use-module (guix upstream)
@@ -483,19 +484,33 @@  (define* (import-release* package #:key (version #f))
 
 (define (html-links sxml)
   "Return the list of links found in SXML, the SXML tree of an HTML page."
-  (let loop ((sxml sxml)
-             (links '()))
-    (match sxml
-      (('a ('@ attributes ...) body ...)
-       (match (assq 'href attributes)
-         (#f          (fold loop links body))
-         (('href url) (fold loop (cons url links) body))))
-      ((tag ('@ _ ...) body ...)
-       (fold loop links body))
-      ((tag body ...)
-       (fold loop links body))
-      (_
-       links))))
+  (define-values (links base)
+    (let loop ((sxml sxml)
+               (links '())
+               (base #f))
+      (match sxml
+        (('a ('@ attributes ...) body ...)
+         (match (assq 'href attributes)
+           (#f          (fold2 loop links base body))
+           (('href url) (fold2 loop (cons url links) base body))))
+        (('base ('@ ('href new-base)))
+         ;; The base against which relative URL paths must be resolved.
+         (values links new-base))
+        ((tag ('@ _ ...) body ...)
+         (fold2 loop links base body))
+        ((tag body ...)
+         (fold2 loop links base body))
+        (_
+         (values links base)))))
+
+  (if base
+      (map (lambda (link)
+             (let ((uri (string->uri link)))
+               (if (or uri (string-prefix? "/" link))
+                   link
+                   (in-vicinity base link))))
+           links)
+      links))
 
 (define (url->links url)
   "Return the unique links on the HTML page accessible at URL."