[bug#78051,WIP,v6] services: root-file-system: In 'stop' method, find and kill processes that are writing to our filesystems, and then umount the filesystems.
Commit Message
* gnu/services/base.scm (%root-file-system-shepherd-service): In 'stop'
method, find and kill processes that are writing to our filesystems, and then
umount the filesystems.
Change-Id: Ib0ffff2257dca5fff3df99fea2d5de81a9612336
---
gnu/services/base.scm | 2860 +++++++++++++++++++++++------------------
1 file changed, 1627 insertions(+), 1233 deletions(-)
@@ -61,15 +61,15 @@ (define-module (gnu services base)
#:use-module (gnu packages admin)
#:use-module ((gnu packages linux)
#:select (alsa-utils btrfs-progs crda eudev
- e2fsprogs f2fs-tools fuse gpm kbd lvm2 rng-tools
- util-linux xfsprogs))
+ e2fsprogs f2fs-tools fuse gpm kbd lvm2 rng-tools
+ util-linux xfsprogs))
#:use-module (gnu packages bash)
#:use-module ((gnu packages base)
#:select (coreutils glibc glibc/hurd
- glibc-utf8-locales
- libc-utf8-locales-for-target
- make-glibc-utf8-locales
- tar canonical-package))
+ glibc-utf8-locales
+ libc-utf8-locales-for-target
+ make-glibc-utf8-locales
+ tar canonical-package))
#:use-module ((gnu packages cross-base)
#:select (cross-libc))
#:use-module ((gnu packages compression) #:select (gzip))
@@ -346,12 +346,373 @@ (define %root-file-system-shepherd-service
(shepherd-service
(documentation "Take care of the root file system.")
(provision '(root-file-system))
+ ;; Is it possible to have (gnu build linux-boot) loaded already?
+ ;; In that case, I'd like to move a lot of stuff there.
+ (modules '((ice-9 textual-ports)
+ (ice-9 control)
+ (ice-9 string-fun)
+ (ice-9 match)
+ (ice-9 ftw) ; scandir
+ (ice-9 rdelim)
+ (srfi srfi-1) ; filter, for-each, find.
+ (srfi srfi-26) ; cut
+ (ice-9 exceptions))) ; guard
+ ; TODO (guix build syscalls)
(start #~(const #t))
(stop #~(lambda _
- ;; Return #f if successfully stopped.
+ ;;; Return #f if successfully stopped.
+
+ ;;; Beginning of inlined module (fuser)
+
+ (define log (make-parameter (lambda args
+ (apply format (current-error-port) args))))
+ (define *proc-dir-name* "/proc")
+ (define *default-silent-errors*
+ (list ENOENT ESRCH))
+
+ (define* (call-with-safe-syscall thunk
+ #:key
+ (on-error #f)
+ (silent-errors *default-silent-errors*)
+ (error-message-format #f)
+ (error-context '()))
+ "Call THUNK, handling system errors:
+- If ERROR-MESSAGE-FORMAT and the error is not in SILENT-ERRORS, calls format
+with ERROR-MESSAGE-FORMAT and ERROR-CONTEXT and (strerror errno) as arguments.
+- Return ON-ERROR on error."
+ (catch 'system-error
+ thunk
+ (lambda args
+ (let ((errno (system-error-errno args)))
+ (unless (member errno silent-errors)
+ (when error-message-format
+ (apply (log)
+ error-message-format
+ (append
+ error-context
+ (list (strerror errno))))))
+ on-error))))
+
+ (define (safe-stat path)
+ "Get stat info for PATH--or #f if not possible."
+ (call-with-safe-syscall (lambda () (stat path))
+ #:error-message-format "Error: Cannot stat ~s: ~a~%"
+ #:error-context (list path)
+ #:silent-errors '()
+ #:on-error #f))
+
+ (define (safe-umount path) ; TODO: UMOUNT_NOFOLLOW ?
+ "Umount PATH--if possible.."
+ (call-with-safe-syscall (lambda () (umount path))
+ #:error-message-format "Error: Cannot umount ~s: ~a~%"
+ #:error-context (list path)
+ #:silent-errors '()
+ #:on-error 'error))
+
+ (define (safe-lstat path)
+ "Get lstat info for PATH--or #f if not possible."
+ (call-with-safe-syscall (lambda () (lstat path))
+ #:error-message-format "Error: Cannot lstat ~s: ~a~%"
+ #:error-context (list path)
+ #:on-error #f))
+
+ (define (safe-scandir path)
+ "scandir PATH--or #f if not possible."
+ (let ((result (scandir path)))
+ (if result
+ result
+ (begin
+ ((log) "Error: Cannot scandir ~s: ?~%" path)
+ '()))))
+
+;;; Processes
+
+ (define (safe-get-fd-flags pid fd)
+ "Get flags for FD in PID--or #f if not possible."
+ (let ((fdinfo-path (format #f "~a/~a/fdinfo/~a" *proc-dir-name* pid fd)))
+ (call-with-safe-syscall (lambda ()
+ (call-with-input-file fdinfo-path
+ (lambda (port)
+ ;; Find 'flags:' line and parse octal value
+ (let loop ()
+ (let ((line (get-line port)))
+ (cond ((eof-object? line) #f)
+ ((string-prefix? "flags:\t" line)
+ (match (string-split line #\tab)
+ ((_ flags-str)
+ (catch 'invalid-argument
+ (lambda ()
+ (string->number flags-str 8))
+ (lambda args
+ #f)))
+ (_ #f)))
+ (else (loop))))))))
+ #:error-message-format "Error: Cannot read ~s: ~a~%"
+ #:error-context (list fdinfo-path)
+ #:on-error #f)))
+
+ (define (safe-get-processes)
+ "Get a list of all PIDs from proc--or #f if not possible."
+ (let ((proc-dir *proc-dir-name*))
+ (catch 'system-error
+ (lambda ()
+ ;; Keep only numbers.
+ (filter-map string->number (safe-scandir proc-dir)))
+ ;; FIXME is errno even useful?
+ (lambda scan-err
+ ((log) "Error scanning ~s: ~a~%"
+ proc-dir (strerror (system-error-errno scan-err)))
+ '()))))
+
+ (define (safe-fd-on-device? pid fd target-device)
+ "Return whether fd FD on pid PID is on device TARGET-DEVICE."
+ (let* ((fd-path (readlink (format #f "~a/~a/fd/~a" *proc-dir-name* pid fd)))
+ (stat (safe-lstat fd-path)))
+ (and stat (eqv? (stat:dev stat)
+ target-device))))
+
+ (define (safe-get-process-fds pid)
+ "Get a list of all FDs of PID from proc--or #f if not possible."
+ (let ((fd-dir (format #f "~a/~a/fd" *proc-dir-name* pid)))
+ ;; Keep only numbers.
+ (filter-map string->number (safe-scandir fd-dir))))
+
+ (define (filter-process-fd-flags pid fds predicate)
+ "Get FLAGS from proc for PID and call PREDICATE with (FD FLAGS) each."
+ (filter (lambda (fd)
+ (predicate fd (safe-get-fd-flags pid fd)))
+ fds))
+
+ (define (safe-get-process-command pid)
+ "Return command of process PID--or #f if not possible."
+ (let ((cmdline-path (format #f "~a/~a/cmdline" *proc-dir-name* pid)))
+ (call-with-safe-syscall (lambda ()
+ (call-with-input-file cmdline-path
+ (lambda (port)
+ (let ((full-cmdline (get-string-all port)))
+ (match (string-split full-cmdline #\nul)
+ ((command-name . _) command-name))))))
+ #:error-message-format "Error: Cannot read ~s: ~a~%"
+ #:error-context (list cmdline-path)
+ #:on-error #f)))
+
+ (define (safe-kill-process pid kill-signal)
+ "Kill process PID with KILL-SIGNAL if possible."
+ (call-with-safe-syscall (lambda ()
+ (kill pid kill-signal)
+ #t)
+ #:on-error 'error
+ #:silent-errors '()
+ #:error-message-format
+ "Error: Failed to kill process ~a: ~a~%"
+ #:error-context '()))
+
+;;; Mounts
+
+ (define (safe-get-device mount-point)
+ "Get the device ID (st_dev) of MOUNT-POINT--or #f if not possible."
+ (and=>
+ (safe-stat mount-point) ; TODO: lstat? Is that safe?
+ stat:dev))
+
+ (define (safe-parse-mountinfo path)
+ "Read and parse /proc/self/mountinfo (or specified path).
+Return a list of parsed entries, where each entry is:
+(list mount-id parent-id mount-point-string)
+Return '() on file read error or if file is unparseable."
+ (call-with-safe-syscall ; TODO: call-with-input-file is not actually a syscall.
+ (lambda ()
+ (let ((entries '()))
+ (call-with-input-file path
+ (lambda (port)
+ (let loop ()
+ (let ((line (get-line port)))
+ (unless (eof-object? line)
+ (match (string-split line #\space)
+ ((mount-id-str parent-id-str major-minor root mount-point rest ...)
+ ;; Attempt to parse IDs, skip line on error
+ (catch 'invalid-argument
+ (lambda ()
+ (let ((mount-id (string->number mount-id-str))
+ (parent-id (string->number parent-id-str)))
+ ;; Add successfully parsed entry to list
+ (set! entries (cons (list mount-id parent-id mount-point)
+ entries))
+ (loop)))
+ (lambda args
+ ((log)
+ "Warning: Skipping mountinfo line due to parse error: ~s (~a)~%"
+ line args)
+ (loop))))
+ (x (begin
+ ((log) "Warning: Skipping mountinfo line: %s" x)
+ (loop)))))))))
+ ;; Return parsed entries in file order
+ (reverse entries)))
+ #:error-message-format "Error: Cannot read or parse mountinfo file ~s: ~a"
+ #:error-context (list path)
+ #:on-error '(error)))
+
+ (define (safe-find-nested-mounts root-mount-point target-device)
+ "Find mount points that block the unmounting of ROOT-MOUNT-POINT.
+TARGET-DEVICE argument is ignored.
+Mountpoints are returned depth-first (in the order they can be unmounted).
+ROOT-MOUNT-POINT is included."
+ (let* ((mountinfo (safe-parse-mountinfo (format #f "~a/self/mountinfo" *proc-dir-name*))))
+ (define (safe-find-mounts-via-mountinfo accumulator lives root-mount-point)
+ (if (member root-mount-point accumulator)
+ ((log) "Cycle detected~%"))
+ (let ((accumulator (cons root-mount-point accumulator)))
+ (if (= lives 0)
+ (begin
+ ((log) "Error: Recursive mountpoints too deep.~%")
+ accumulator)
+ (let ((root-entry (find (lambda (entry)
+ (match entry
+ ((_ _ mp) (string=? mp root-mount-point))
+ (_ #f))) ; Should not happen
+ mountinfo)))
+ (if root-entry
+ (let ((root-mount-id (car root-entry)))
+ (fold (lambda (entry accumulator)
+ (match entry
+ ((_ parent-id mp)
+ (if (= parent-id root-mount-id)
+ (safe-find-mounts-via-mountinfo accumulator
+ (- lives 1)
+ mp)
+ accumulator))
+ (_ accumulator)))
+ accumulator
+ mountinfo))
+ (begin
+ ((log) "Error: Could not find mount ID for ~s in parsed mountinfo~%"
+ root-mount-point)
+ accumulator))))))
+ (safe-find-mounts-via-mountinfo '() 100 root-mount-point)))
+
+ ;;; End of inlined module (fuser)
+
+ (define *root-mount-point* "/")
+
+ (define O_ACCMODE #o0003)
+
+ (define (flags-has-write-access? flags)
+ "Given open FLAGS, return whether it (probably) signifies write access."
+ (and flags (not (= (logand flags O_ACCMODE)
+ O_RDONLY))))
+
+ (define (kill-process? pid command)
+ "Return whether to kill process with id PID (and command COMMAND)"
+ ((log) "~%Process Found: PID ~a Command: ~s~%" pid command)
+ #t
+ ;((log) "Kill process ~a? [y/N] " pid)
+ ;(force-output (current-error-port))
+ ;(let ((response (read-char (current-input-port))))
+ ; (if (not (eof-object? response))
+ ; ;; Consume rest of line.
+ ; (read-line (current-input-port)))
+ ; (or (eqv? response #\y)
+ ; (eqv? response #\Y)))
+ )
+
(sync)
- (let ((null (%make-void-port "w")))
+ (let* ((null (%make-void-port "w"))
+ (call-with-io-file (lambda (file-name proc)
+ (let ((port (open file-name O_RDWR)))
+ (set-current-input-port port)
+ (set-current-output-port port)
+ (set-current-error-port port)
+ (catch #t (lambda ()
+ (proc)
+ (set-current-input-port null)
+ (set-current-output-port null)
+ (set-current-error-port null)
+ (close port))
+ (lambda args
+ (set-current-input-port null)
+ (set-current-output-port null)
+ (set-current-error-port null)
+ (close port)))))))
+ (let-syntax ((with-mounted-filesystem (syntax-rules ()
+ ((_ source mountpoint file-system-type flags options exp ...)
+ (call-with-mounted-filesystem source mountpoint file-system-type flags options
+ (lambda () (begin exp ...)))))))
+
+ (define (call-with-logging thunk)
+ (mkdir-p "/proc")
+ (mkdir-p "/dev")
+ (with-mounted-filesystem "none" "/proc" "proc" 0 #f ; TODO: MS_NODEV, MS_NOEXEC, MS_NOSUID
+ (with-mounted-filesystem "none" "/dev" "devtmpfs" 0 #f ; TODO: MS_NOEXEC, MS_NOSUID
+ (catch 'system-error
+ (lambda ()
+ (mknod "/dev/console" 'char-special #o600 (+ (* 5 256) 1)))
+ (const #f))
+ (catch 'system-error
+ (lambda ()
+ (mknod "/dev/tty0" 'char-special #o600 (+ (* 4 256) 0)))
+ (const #f))
+ (call-with-io-file "/dev/console" ; TODO: /dev/console after we set it up using vt-set-as-console at boot or something (see plymouth).
+ (lambda ()
+ ;(vt-activate (current-input-port) 12)
+ (thunk))))))
+
+ (define (get-clean-ups)
+ ;; We rarely (or ever) log--and if we did have a logger
+ ;; at all times, we'd show up on our own shitlist.
+ ;; So: open logger, log, close logger--on every message.
+ (parameterize ((log (lambda args
+ (call-with-logging
+ (lambda ()
+ (format (current-error-port) args))))))
+ (let* ((root-device (safe-get-device *root-mount-point*))
+ (mounts (safe-find-nested-mounts *root-mount-point* root-device))
+ (mount-devices (map safe-get-device mounts)))
+ (let* ((our-pid (getpid))
+ (pids (filter (lambda (pid)
+ (not (= pid our-pid)))
+ (safe-get-processes)))
+ (pids (filter (lambda (pid)
+ (match (filter-process-fd-flags pid
+ (safe-get-process-fds pid)
+ (lambda (fd flags)
+ (and (flags-has-write-access? flags)
+ (find (lambda (target-device)
+ (safe-fd-on-device? pid fd target-device))
+ mount-devices))))
+ ((x . _) #t)
+ (_ #f)))
+ pids)))
+ (list pids mounts mount-devices)))))
+
+ (define (call-with-mounted-filesystem source mountpoint file-system-type flags options proc)
+ (mount source mountpoint file-system-type flags options #:update-mtab? #f)
+ (catch #t
+ (lambda ()
+ (proc)
+ (umount mountpoint))
+ (lambda args
+ (umount mountpoint))))
+
+ ;; This will also take care of setting up a logger for the
+ ;; entire runtime of the function.
+ (define (kill-processes pids mounts mount-devices signal)
+ (call-with-logging
+ (lambda ()
+ (parameterize ((log (lambda args
+ (apply format (current-error-port) args))))
+ (let ((error-port (current-error-port)))
+ (format error-port "Searched for processes writing on devices ~s (mount points ~s)...~%" mount-devices mounts)
+ (format error-port "Found ~a process(es) matching the criteria.~%" (length pids))
+ (for-each (lambda (pid)
+ (let ((command (safe-get-process-command pid)))
+ (if (kill-process? pid command)
+ (safe-kill-process pid signal)
+ (format error-port "Skipping PID ~a (~s).~%" pid command))))
+ pids)
+ (format error-port "~%Process scan complete.~%"))))))
+
;; Redirect the default output ports.
(set-current-output-port null)
(set-current-error-port null)
@@ -363,12 +724,45 @@ (define %root-file-system-shepherd-service
;; root file system can be re-mounted read-only.
(let loop ((n 10))
(unless (catch 'system-error
- (lambda ()
- (mount #f "/" #f
- (logior MS_REMOUNT MS_RDONLY)
- #:update-mtab? #f)
- #t)
- (const #f))
+ (lambda ()
+ (mount #f "/" #f
+ (logior MS_REMOUNT MS_RDONLY)
+ #:update-mtab? #f)
+ #t)
+ (const #f))
+ (when (zero? n)
+ ;; 1. Send SIGTERM to all writing processes (if any)
+ (match (get-clean-ups)
+ ((pids mounts mount-devices)
+ (when (> (length pids) 0)
+ (kill-processes pids mounts mount-devices SIGTERM)
+ ((@ (fibers) sleep) 5))))
+
+ ;; 2. Send SIGKILL to all writing processes
+ (match (get-clean-ups)
+ ((pids mounts mount-devices)
+ (when (> (length pids) 0)
+ (kill-processes pids mounts mount-devices SIGKILL)
+ ((@ (fibers) sleep) 5))
+
+ ;; 3. Unmount filesystems
+ (for-each safe-umount mounts)))
+
+ ;; Should have been unmounted already--but we are paranoid
+ ;; (and possibly were blocking ourselves anyway).
+ (catch 'system-error
+ (lambda ()
+ (mount #f "/" #f
+ (logior MS_REMOUNT MS_RDONLY)
+ #:update-mtab? #f)
+ ((@ (fibers) sleep) 5) ; just in case
+ #t)
+ (lambda args
+ ((log) "failed to remount / ro %s %~" args)
+ (let loopity ((q 0))
+ ((log) "user, do something!~%")
+ ((@ (fibers) sleep) 1)
+ (loopity (+ q 1)))))))
(unless (zero? n)
;; Yield to the other fibers. That gives logging fibers
;; an opportunity to close log files so the 'mount' call