[bug#77638,8/8] linux-container: Lock mounts by default.

Message ID 1b6a3534339319d7e792f5888b51e02b27607d65.1744114408.git.ludo@gnu.org
State New
Headers
Series Harden 'call-with-container' |

Commit Message

Ludovic Courtès April 8, 2025, 12:24 p.m. UTC
  This makes it impossible to unmount or remount things from within
‘call-with-container’.

* gnu/build/linux-container.scm (initialize-user-namespace):
Add #:host-uid and #:host-gid. and honor them.
(run-container): Add #:lock-mounts?.  Honor it by calling ‘unshare’
followed by ‘initialize-user-namespace’.
(call-with-container): Add #:lock-mounts? and pass it down.
(container-excursion): Get the user namespace owning the PID namespace
and join it, then join the remaining namespaces.
* tests/containers.scm ("call-with-container, mnt namespace, locked mounts"):
New test.
("container-excursion"): Pass #:lock-mounts? #f.

Change-Id: I13be982aef99e68a653d472f0e595c81cfcfa392
---
 gnu/build/linux-container.scm | 111 ++++++++++++++++++++++------------
 tests/containers.scm          |  33 ++++++++--
 2 files changed, 103 insertions(+), 41 deletions(-)
  

Patch

diff --git a/gnu/build/linux-container.scm b/gnu/build/linux-container.scm
index 345ce2de08..51f04bc249 100644
--- a/gnu/build/linux-container.scm
+++ b/gnu/build/linux-container.scm
@@ -189,7 +189,10 @@  (define* (mount-file-systems root mounts #:key mount-/sys? mount-/proc?
       (remount-read-only "/"))))
 
 (define* (initialize-user-namespace pid host-uids
-                                    #:key (guest-uid 0) (guest-gid 0))
+                                    #:key
+                                    (host-uid (getuid))
+                                    (host-gid (getgid))
+                                    (guest-uid 0) (guest-gid 0))
   "Configure the user namespace for PID.  HOST-UIDS specifies the number of
 host user identifiers to map into the user namespace.  GUEST-UID and GUEST-GID
 specify the first UID (respectively GID) that host UIDs (respectively GIDs)
@@ -200,24 +203,21 @@  (define* (initialize-user-namespace pid host-uids
   (define (scope file)
     (string-append proc-dir file))
 
-  (let ((uid (getuid))
-        (gid (getgid)))
-
-    ;; Only root can write to the gid map without first disabling the
-    ;; setgroups syscall.
-    (unless (and (zero? uid) (zero? gid))
-      (call-with-output-file (scope "/setgroups")
-        (lambda (port)
-          (display "deny" port))))
-
-    ;; Map the user/group that created the container to the root user
-    ;; within the container.
-    (call-with-output-file (scope "/uid_map")
+  ;; Only root can write to the gid map without first disabling the
+  ;; setgroups syscall.
+  (unless (and (zero? host-uid) (zero? host-gid))
+    (call-with-output-file (scope "/setgroups")
       (lambda (port)
-        (format port "~d ~d ~d" guest-uid uid host-uids)))
-    (call-with-output-file (scope "/gid_map")
-      (lambda (port)
-        (format port "~d ~d ~d" guest-gid gid host-uids)))))
+        (display "deny" port))))
+
+  ;; Map the user/group that created the container to the root user
+  ;; within the container.
+  (call-with-output-file (scope "/uid_map")
+    (lambda (port)
+      (format port "~d ~d ~d" guest-uid host-uid host-uids)))
+  (call-with-output-file (scope "/gid_map")
+    (lambda (port)
+      (format port "~d ~d ~d" guest-gid host-gid host-uids))))
 
 (define (namespaces->bit-mask namespaces)
   "Return the number suitable for the 'flags' argument of 'clone' that
@@ -238,12 +238,14 @@  (define* (run-container root mounts namespaces host-uids thunk
                         #:key (guest-uid 0) (guest-gid 0)
                         (populate-file-system (const #t))
                         (loopback-network? #t)
+                        (lock-mounts? #t)
                         writable-root?)
   "Run THUNK in a new container process and return its PID.  ROOT specifies
 the root directory for the container.  MOUNTS is a list of <file-system>
 objects that specify file systems to mount inside the container.  NAMESPACES
 is a list of symbols that correspond to the possible Linux namespaces: mnt,
-ipc, uts, user, and net.
+ipc, uts, user, and net.  When LOCK-MOUNTS? is true, arrange so that none of
+MOUNTS can be unmounted or remounted individually from within THUNK.
 
 When LOOPBACK-NETWORK? is true and 'net is amount NAMESPACES, set up the
 loopback device (\"lo\") and a minimal /etc/hosts.
@@ -303,6 +305,28 @@  (define* (run-container root mounts namespaces host-uids thunk
                       ;; cannot be 'read' so they shouldn't be written as is.
                       (write args child)
                       (primitive-exit 3))))
+
+                (when (and lock-mounts?
+                           (memq 'mnt namespaces)
+                           (memq 'user namespaces))
+                  ;; Create a new mount namespace owned by a new user
+                  ;; namespace to "lock" together previous mounts, such that
+                  ;; they cannot be unmounted or remounted separately--see
+                  ;; mount_namespaces(7).
+                  ;;
+                  ;; Note: at this point, the process is single-threaded (no
+                  ;; GC mark threads, no finalization thread, etc.) which is
+                  ;; why unshare(CLONE_NEWUSER) can be used.
+                  (let ((uid (getuid)) (gid (getgid)))
+                    (unshare (logior CLONE_NEWUSER CLONE_NEWNS))
+                    (when (file-exists? "/proc/self")
+                      (initialize-user-namespace (getpid)
+                                                 host-uids
+                                                 #:host-uid uid
+                                                 #:host-gid gid
+                                                 #:guest-uid guest-uid
+                                                 #:guest-gid guest-gid))))
+
                 ;; TODO: Manage capabilities.
                 (write 'ready child)
                 (close-port child)
@@ -365,6 +389,7 @@  (define (status->exit-status status)
 
 (define* (call-with-container mounts thunk #:key (namespaces %namespaces)
                               (host-uids 1) (guest-uid 0) (guest-gid 0)
+                              (lock-mounts? #t)
                               (relayed-signals (list SIGINT SIGTERM))
                               (child-is-pid1? #t)
                               (populate-file-system (const #t))
@@ -449,6 +474,7 @@  (define* (call-with-container mounts thunk #:key (namespaces %namespaces)
   (call-with-temporary-directory
    (lambda (root)
      (let ((pid (run-container root mounts namespaces host-uids thunk*
+                               #:lock-mounts? lock-mounts?
                                #:guest-uid guest-uid
                                #:guest-gid guest-gid
                                #:populate-file-system populate-file-system
@@ -469,24 +495,35 @@  (define (container-excursion pid thunk)
     (0
      (call-with-clean-exit
       (lambda ()
-        (for-each (lambda (ns)
-                    (let ((source (namespace-file (getpid) ns))
-                          (target (namespace-file pid ns)))
-                      ;; Joining the namespace that the process already
-                      ;; belongs to would throw an error so avoid that.
-                      ;; XXX: This /proc interface leads to TOCTTOU.
-                      (unless (string=? (readlink source) (readlink target))
-                        (call-with-input-file source
-                          (lambda (current-ns-port)
-                            (call-with-input-file target
-                              (lambda (new-ns-port)
-                                (setns (fileno new-ns-port) 0))))))))
-                  ;; It's important that the user namespace is joined first,
-                  ;; so that the user will have the privileges to join the
-                  ;; other namespaces.  Furthermore, it's important that the
-                  ;; mount namespace is joined last, otherwise the /proc mount
-                  ;; point would no longer be accessible.
-                  '("user" "ipc" "uts" "net" "pid" "mnt"))
+        ;; First, determine the user namespace that owns the pid namespace and
+        ;; join that user namespace (the assumption is that it also owns all
+        ;; the other namespaces).  It's important that the user namespace is
+        ;; joined first, so that the user will have the privileges to join the
+        ;; other namespaces.
+        (let* ((pid-ns (open-fdes (namespace-file pid "pid")
+                                  (logior O_CLOEXEC O_RDONLY)))
+               (user-ns (get-user-ns pid-ns)))
+          (close-fdes pid-ns)
+          (unless (equal? (stat user-ns)
+                          (stat (namespace-file (getpid) "user")))
+            (setns user-ns 0))
+          (close-fdes user-ns)
+
+          ;; Then join all the remaining namespaces.
+          (for-each (lambda (ns)
+                      (let ((source (namespace-file (getpid) ns))
+                            (target (namespace-file pid ns)))
+                        ;; Joining the namespace that the process already
+                        ;; belongs to would throw an error so avoid that.
+                        ;; XXX: This /proc interface leads to TOCTTOU.
+                        (unless (string=? (readlink source) (readlink target))
+                          (call-with-input-file target
+                            (lambda (new-ns-port)
+                              (setns (fileno new-ns-port) 0))))))
+                    ;; It's important that the mount namespace is joined last,
+                    ;; otherwise the /proc mount point would no longer be
+                    ;; accessible.
+                    '("ipc" "uts" "net" "pid" "mnt")))
         (purify-environment)
         (chdir "/")
 
diff --git a/tests/containers.scm b/tests/containers.scm
index 1e915d517e..6edea9631d 100644
--- a/tests/containers.scm
+++ b/tests/containers.scm
@@ -1,6 +1,6 @@ 
 ;;; GNU Guix --- Functional package management for GNU
 ;;; Copyright © 2015 David Thompson <davet@gnu.org>
-;;; Copyright © 2016, 2017, 2019, 2023 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2016-2017, 2019, 2023, 2025 Ludovic Courtès <ludo@gnu.org>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -110,6 +110,26 @@  (define (skip-if-unsupported)
        (assert-exit (file-exists? "/testing")))
      #:namespaces '(user mnt))))
 
+(skip-if-unsupported)
+(test-equal "call-with-container, mnt namespace, locked mounts"
+  EINVAL
+  ;; umount(2) fails with EINVAL when targeting a mount point that is
+  ;; "locked".
+  (status:exit-val
+   (call-with-container (list (file-system
+                                (device "none")
+                                (mount-point "/testing")
+                                (type "tmpfs")
+                                (check? #f)))
+     (lambda ()
+       (primitive-exit (catch 'system-error
+                         (lambda ()
+                           (umount "/testing")
+                           0)
+                         (lambda args
+                           (system-error-errno args)))))
+     #:namespaces '(user mnt))))
+
 (skip-if-unsupported)
 (test-equal "call-with-container, mnt namespace, wrong bind mount"
   `(system-error ,ENOENT)
@@ -169,7 +189,8 @@  (define (skip-if-unsupported)
      #:namespaces '(user mnt))))
 
 (skip-if-unsupported)
-(test-assert "container-excursion"
+(test-equal "container-excursion"
+  0
   (call-with-temporary-directory
    (lambda (root)
      ;; Two pipes: One for the container to signal that the test can begin,
@@ -193,7 +214,11 @@  (define (skip-if-unsupported)
                    (readlink (string-append "/proc/" pid "/ns/" ns)))
                  '("user" "ipc" "uts" "net" "pid" "mnt"))))
 
-        (let* ((pid (run-container root '() %namespaces 1 container))
+        (let* ((pid (run-container root '() %namespaces 1 container
+                                   ;; Do not lock mounts so the user namespace
+                                   ;; appears to be the same seen from inside
+                                   ;; and from outside.
+                                   #:lock-mounts? #f))
                (container-namespaces (namespaces pid))
                (result
                 (begin
@@ -213,7 +238,7 @@  (define (skip-if-unsupported)
           (write 'done end-out)
           (close end-out)
           (waitpid pid)
-          (zero? result)))))))
+          result))))))
 
 (skip-if-unsupported)
 (test-equal "container-excursion, same namespaces"