diff mbox series

[bug#49149,5/7] pack: Prevent duplicate files in tar archives.

Message ID 20210621061205.31878-6-maxim.cournoyer@gmail.com
State Accepted
Headers show
Series Add deb format for guix pack. | expand

Commit Message

Maxim Cournoyer June 21, 2021, 6:12 a.m. UTC
Tar translate duplicate files in the archive into hard links.  These can cause
problems, as not every tool support them; for example dpkg doesn't.

* gnu/system/file-systems.scm (reduce-directories): New procedure.
(file-prefix?): Lift the restriction on file prefix.  The procedure can be
useful for comparing relative file names.  Adjust doc.
(file-name-depth): New procedure, extracted from ...
(btrfs-store-subvolume-file-name): ... here.
* guix/scripts/pack.scm (self-contained-tarball/builder): Use
reduce-directories.
* tests/file-systems.scm ("reduce-directories"): New test.
---
 gnu/system/file-systems.scm | 56 +++++++++++++++++++++++++------------
 guix/scripts/pack.scm       |  6 ++--
 tests/file-systems.scm      |  7 ++++-
 3 files changed, 48 insertions(+), 21 deletions(-)

Comments

Ludovic Courtès June 30, 2021, 10:06 a.m. UTC | #1
Hi,

Maxim Cournoyer <maxim.cournoyer@gmail.com> skribis:

> Tar translate duplicate files in the archive into hard links.  These can cause
> problems, as not every tool support them; for example dpkg doesn't.
>
> * gnu/system/file-systems.scm (reduce-directories): New procedure.
> (file-prefix?): Lift the restriction on file prefix.  The procedure can be
> useful for comparing relative file names.  Adjust doc.
> (file-name-depth): New procedure, extracted from ...
> (btrfs-store-subvolume-file-name): ... here.
> * guix/scripts/pack.scm (self-contained-tarball/builder): Use
> reduce-directories.
> * tests/file-systems.scm ("reduce-directories"): New test.

[...]

>  (define (file-prefix? file1 file2)
> -  "Return #t if FILE1 denotes the name of a file that is a parent of FILE2,
> -where both FILE1 and FILE2 are absolute file name.  For example:
> +  "Return #t if FILE1 denotes the name of a file that is a parent of FILE2.
> +For example:
>  
>    (file-prefix? \"/gnu\" \"/gnu/store\")
>    => #t
> @@ -240,19 +241,41 @@ where both FILE1 and FILE2 are absolute file name.  For example:
>    (file-prefix? \"/gn\" \"/gnu/store\")
>    => #f
>  "
> -  (and (string-prefix? "/" file1)
> -       (string-prefix? "/" file2)

Doesn’t it have the effect that now:

  (file-prefix? "gnu" "/gnu/store") => #t

?

I’d rather insist on absolute file names and preserve the initial
semantics, to avoid bad surprises.


> +(define (reduce-directories file-names)
> +  "Eliminate entries in FILE-NAMES that are children of other entries in
> +FILE-NAMES.  This is for example useful when passing a list of files to GNU
> +tar, which would otherwise descend into each directory passed and archive the
> +duplicate files as hard links, which can be undesirable."
> +  (let* ((file-names/sorted
> +          ;; Ascending sort by file hierarchy depth, then by file name length.
> +          (stable-sort (delete-duplicates file-names)
> +                       (lambda (f1 f2)
> +                         (let ((depth1 (file-name-depth f1))
> +                               (depth2 (file-name-depth f2)))
> +                           (if (= depth1 depth2)
> +                               (string< f1 f2)
> +                               (< depth1 depth2)))))))
> +    (reverse (fold (lambda (file-name results)
> +                     (if (find (cut file-prefix? <> file-name) results)
> +                         results        ;parent found -- skipping
> +                         (cons file-name results)))
> +                   '()
> +                   file-names/sorted))))

Likewise, I suspect it doesn’t work as intended if there are relative
file names in the list, no?

Perhaps we could add an example to the docstring.  Also, the word
“reduce” doesn’t appear in the docstring, which to me suggests
suboptimal naming.  ;-)

Thanks,
Ludo’.
Maxim Cournoyer June 30, 2021, 6:16 p.m. UTC | #2
Hey,

Ludovic Courtès <ludo@gnu.org> writes:

[...]

>>  (define (file-prefix? file1 file2)
>> -  "Return #t if FILE1 denotes the name of a file that is a parent of FILE2,
>> -where both FILE1 and FILE2 are absolute file name.  For example:
>> +  "Return #t if FILE1 denotes the name of a file that is a parent of FILE2.
>> +For example:
>>  
>>    (file-prefix? \"/gnu\" \"/gnu/store\")
>>    => #t
>> @@ -240,19 +241,41 @@ where both FILE1 and FILE2 are absolute file name.  For example:
>>    (file-prefix? \"/gn\" \"/gnu/store\")
>>    => #f
>>  "
>> -  (and (string-prefix? "/" file1)
>> -       (string-prefix? "/" file2)
>
> Doesn’t it have the effect that now:
>
>   (file-prefix? "gnu" "/gnu/store") => #t
>
> ?

Good catch.  That seems sub-optimal.  How about:

--8<---------------cut here---------------start------------->8---
modified   gnu/system/file-systems.scm
@@ -233,6 +233,8 @@
 
 (define (file-prefix? file1 file2)
   "Return #t if FILE1 denotes the name of a file that is a parent of FILE2.
+FILE1 and FILE2 must both be either absolute or relative, else #f is returned.
+
 For example:
 
   (file-prefix? \"/gnu\" \"/gnu/store\")
@@ -241,17 +243,24 @@ For example:
   (file-prefix? \"/gn\" \"/gnu/store\")
   => #f
 "
-  (let loop ((file1 (string-tokenize file1 %not-slash))
-             (file2 (string-tokenize file2 %not-slash)))
-    (match file1
-      (()
-       #t)
-      ((head1 tail1 ...)
-       (match file2
-         ((head2 tail2 ...)
-          (and (string=? head1 head2) (loop tail1 tail2)))
-         (()
-          #f))))))
+  (define (absolute? file)
+    (string-prefix? "/" file))
+
+  (if (or (every absolute? (list file1 file2))
+          (every (negate absolute?) (list file1 file2)))
+      (let loop ((file1 (string-tokenize file1 %not-slash))
+                 (file2 (string-tokenize file2 %not-slash)))
+        (match file1
+          (()
+           #t)
+          ((head1 tail1 ...)
+           (match file2
+             ((head2 tail2 ...)
+              (and (string=? head1 head2) (loop tail1 tail2)))
+             (()
+              #f)))))
+      ;; FILE1 and FILE2 are a mix of absolute and relative paths.
+      #f))
--8<---------------cut here---------------end--------------->8---
 
 (define (file-name-depth file-name)
   (length (string-tokenize file-name %not-slash)))

> I’d rather insist on absolute file names and preserve the initial
> semantics, to avoid bad surprises.

I agree that not changing the original semantics would be safest;
nevertheless, we're talking about an internal helper that isn't widely
use; its couple usages are easy to review (and deals with mount points
which seems safe to assume are exclusively using absolute paths).
Especially after the above fix :-).

>> +(define (reduce-directories file-names)
>> +  "Eliminate entries in FILE-NAMES that are children of other entries in
>> +FILE-NAMES.  This is for example useful when passing a list of files to GNU
>> +tar, which would otherwise descend into each directory passed and archive the
>> +duplicate files as hard links, which can be undesirable."
>> +  (let* ((file-names/sorted
>> +          ;; Ascending sort by file hierarchy depth, then by file name length.
>> +          (stable-sort (delete-duplicates file-names)
>> +                       (lambda (f1 f2)
>> +                         (let ((depth1 (file-name-depth f1))
>> +                               (depth2 (file-name-depth f2)))
>> +                           (if (= depth1 depth2)
>> +                               (string< f1 f2)
>> +                               (< depth1 depth2)))))))
>> +    (reverse (fold (lambda (file-name results)
>> +                     (if (find (cut file-prefix? <> file-name) results)
>> +                         results        ;parent found -- skipping
>> +                         (cons file-name results)))
>> +                   '()
>> +                   file-names/sorted))))
>
> Likewise, I suspect it doesn’t work as intended if there are relative
> file names in the list, no?

You can see it at work in the tests/file-systems test module; it reduces

(reduce-directories '("./opt/gnu/etc" "./opt/gnu/" "./opt/gnu/bin"
                        "./opt/gnu/lib/debug" "./opt/gnuism" "a/b/c"
                        "a/b/c"))

into '("./opt/gnu/" "./opt/gnuism" "a/b/c"), none of which are absolute
file names.

> Perhaps we could add an example to the docstring.  Also, the word
> “reduce” doesn’t appear in the docstring, which to me suggests
> suboptimal naming.  ;-)

That the word 'reduce' doesn't appear in the docstring was a conscious
effort of mine to not bore the reader with repeating the same terms, ah!
But naming is hard; I'm open to suggestions.

Maxim
Ludovic Courtès July 1, 2021, 1:24 p.m. UTC | #3
Hi!

Maxim Cournoyer <maxim.cournoyer@gmail.com> skribis:


[...]

>>>  (define (file-prefix? file1 file2)
>>> -  "Return #t if FILE1 denotes the name of a file that is a parent of FILE2,
>>> -where both FILE1 and FILE2 are absolute file name.  For example:
>>> +  "Return #t if FILE1 denotes the name of a file that is a parent of FILE2.
>>> +For example:
>>>  
>>>    (file-prefix? \"/gnu\" \"/gnu/store\")
>>>    => #t
>>> @@ -240,19 +241,41 @@ where both FILE1 and FILE2 are absolute file name.  For example:
>>>    (file-prefix? \"/gn\" \"/gnu/store\")
>>>    => #f
>>>  "
>>> -  (and (string-prefix? "/" file1)
>>> -       (string-prefix? "/" file2)
>>
>> Doesn’t it have the effect that now:
>>
>>   (file-prefix? "gnu" "/gnu/store") => #t
>>
>> ?
>
> Good catch.  That seems sub-optimal.  How about:

[...]

> +  (define (absolute? file)
> +    (string-prefix? "/" file))
> +
> +  (if (or (every absolute? (list file1 file2))
> +          (every (negate absolute?) (list file1 file2)))

Yes, that could work.

>> I’d rather insist on absolute file names and preserve the initial
>> semantics, to avoid bad surprises.
>
> I agree that not changing the original semantics would be safest;
> nevertheless, we're talking about an internal helper that isn't widely
> use; its couple usages are easy to review (and deals with mount points
> which seems safe to assume are exclusively using absolute paths).
> Especially after the above fix :-).

Sure, but it’s always easier to reason about code that is stricter.

>>> +(define (reduce-directories file-names)
>>> +  "Eliminate entries in FILE-NAMES that are children of other entries in
>>> +FILE-NAMES.  This is for example useful when passing a list of files to GNU
>>> +tar, which would otherwise descend into each directory passed and archive the
>>> +duplicate files as hard links, which can be undesirable."
>>> +  (let* ((file-names/sorted
>>> +          ;; Ascending sort by file hierarchy depth, then by file name length.
>>> +          (stable-sort (delete-duplicates file-names)
>>> +                       (lambda (f1 f2)
>>> +                         (let ((depth1 (file-name-depth f1))
>>> +                               (depth2 (file-name-depth f2)))
>>> +                           (if (= depth1 depth2)
>>> +                               (string< f1 f2)
>>> +                               (< depth1 depth2)))))))
>>> +    (reverse (fold (lambda (file-name results)
>>> +                     (if (find (cut file-prefix? <> file-name) results)
>>> +                         results        ;parent found -- skipping
>>> +                         (cons file-name results)))
>>> +                   '()
>>> +                   file-names/sorted))))
>>
>> Likewise, I suspect it doesn’t work as intended if there are relative
>> file names in the list, no?
>
> You can see it at work in the tests/file-systems test module; it reduces
>
> (reduce-directories '("./opt/gnu/etc" "./opt/gnu/" "./opt/gnu/bin"
>                         "./opt/gnu/lib/debug" "./opt/gnuism" "a/b/c"
>                         "a/b/c"))
>
> into '("./opt/gnu/" "./opt/gnuism" "a/b/c"), none of which are absolute
> file names.

Oh right!

>> Perhaps we could add an example to the docstring.  Also, the word
>> “reduce” doesn’t appear in the docstring, which to me suggests
>> suboptimal naming.  ;-)
>
> That the word 'reduce' doesn't appear in the docstring was a conscious
> effort of mine to not bore the reader with repeating the same terms, ah!
> But naming is hard; I'm open to suggestions.

Actually I don’t have a good suggestion.  :-)
‘strip-child-directories’ maybe?

Thanks,
Ludo’.
diff mbox series

Patch

diff --git a/gnu/system/file-systems.scm b/gnu/system/file-systems.scm
index 464e87cb18..fb87bfc85b 100644
--- a/gnu/system/file-systems.scm
+++ b/gnu/system/file-systems.scm
@@ -55,6 +55,7 @@ 
             file-system-dependencies
             file-system-location
 
+            reduce-directories
             file-system-type-predicate
             btrfs-subvolume?
             btrfs-store-subvolume-file-name
@@ -231,8 +232,8 @@ 
   (char-set-complement (char-set #\/)))
 
 (define (file-prefix? file1 file2)
-  "Return #t if FILE1 denotes the name of a file that is a parent of FILE2,
-where both FILE1 and FILE2 are absolute file name.  For example:
+  "Return #t if FILE1 denotes the name of a file that is a parent of FILE2.
+For example:
 
   (file-prefix? \"/gnu\" \"/gnu/store\")
   => #t
@@ -240,19 +241,41 @@  where both FILE1 and FILE2 are absolute file name.  For example:
   (file-prefix? \"/gn\" \"/gnu/store\")
   => #f
 "
-  (and (string-prefix? "/" file1)
-       (string-prefix? "/" file2)
-       (let loop ((file1 (string-tokenize file1 %not-slash))
-                  (file2 (string-tokenize file2 %not-slash)))
-         (match file1
-           (()
-            #t)
-           ((head1 tail1 ...)
-            (match file2
-              ((head2 tail2 ...)
-               (and (string=? head1 head2) (loop tail1 tail2)))
-              (()
-               #f)))))))
+  (let loop ((file1 (string-tokenize file1 %not-slash))
+             (file2 (string-tokenize file2 %not-slash)))
+    (match file1
+      (()
+       #t)
+      ((head1 tail1 ...)
+       (match file2
+         ((head2 tail2 ...)
+          (and (string=? head1 head2) (loop tail1 tail2)))
+         (()
+          #f))))))
+
+(define (file-name-depth file-name)
+  (length (string-tokenize file-name %not-slash)))
+
+(define (reduce-directories file-names)
+  "Eliminate entries in FILE-NAMES that are children of other entries in
+FILE-NAMES.  This is for example useful when passing a list of files to GNU
+tar, which would otherwise descend into each directory passed and archive the
+duplicate files as hard links, which can be undesirable."
+  (let* ((file-names/sorted
+          ;; Ascending sort by file hierarchy depth, then by file name length.
+          (stable-sort (delete-duplicates file-names)
+                       (lambda (f1 f2)
+                         (let ((depth1 (file-name-depth f1))
+                               (depth2 (file-name-depth f2)))
+                           (if (= depth1 depth2)
+                               (string< f1 f2)
+                               (< depth1 depth2)))))))
+    (reverse (fold (lambda (file-name results)
+                     (if (find (cut file-prefix? <> file-name) results)
+                         results        ;parent found -- skipping
+                         (cons file-name results)))
+                   '()
+                   file-names/sorted))))
 
 (define* (file-system-device->string device #:key uuid-type)
   "Return the string representations of the DEVICE field of a <file-system>
@@ -624,9 +647,6 @@  store is located, else #f."
         s
         (string-append "/" s)))
 
-  (define (file-name-depth file-name)
-    (length (string-tokenize file-name %not-slash)))
-
   (and-let* ((btrfs-subvolume-fs (filter btrfs-subvolume? file-systems))
              (btrfs-subvolume-fs*
               (sort btrfs-subvolume-fs
diff --git a/guix/scripts/pack.scm b/guix/scripts/pack.scm
index 9d4bb9f497..8a108b7a1a 100644
--- a/guix/scripts/pack.scm
+++ b/guix/scripts/pack.scm
@@ -225,13 +225,15 @@  dependencies are registered."
                           `((guix build pack)
                             (guix build utils)
                             (guix build union)
-                            (gnu build install))
+                            (gnu build install)
+                            (gnu system file-systems))
                           #:select? import-module?)
     #~(begin
         (use-modules (guix build pack)
                      (guix build utils)
                      ((guix build union) #:select (relative-file-name))
                      (gnu build install)
+                     ((gnu system file-systems) #:select (reduce-directories))
                      (srfi srfi-1)
                      (srfi srfi-26)
                      (ice-9 match))
@@ -298,7 +300,7 @@  dependencies are registered."
 
                    ,(string-append "." (%store-directory))
 
-                   ,@(delete-duplicates
+                   ,@(reduce-directories
                       (filter-map (match-lambda
                                     (('directory directory)
                                      (string-append "." directory))
diff --git a/tests/file-systems.scm b/tests/file-systems.scm
index 7f7c373884..80acb6d5b9 100644
--- a/tests/file-systems.scm
+++ b/tests/file-systems.scm
@@ -1,6 +1,6 @@ 
 ;;; GNU Guix --- Functional package management for GNU
 ;;; Copyright © 2015, 2017 Ludovic Courtès <ludo@gnu.org>
-;;; Copyright © 2020 Maxim Cournoyer <maxim.cournoyer@gmail.com>
+;;; Copyright © 2020, 2021 Maxim Cournoyer <maxim.cournoyer@gmail.com>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -50,6 +50,11 @@ 
                    (device "/foo")
                    (flags '(bind-mount read-only)))))))))
 
+(test-equal "reduce-directories"
+  '("./opt/gnu/" "./opt/gnuism" "a/b/c")
+  (reduce-directories '("./opt/gnu/etc" "./opt/gnu/" "./opt/gnu/bin"
+                        "./opt/gnu/lib/debug" "./opt/gnuism" "a/b/c" "a/b/c")))
+
 (test-assert "does not pull (guix config)"
   ;; This module is meant both for the host side and "build side", so make
   ;; sure it doesn't pull in (guix config), which depends on the user's