[bug#73660] gexp: Improve support of Unicode characters.

Message ID a0c437bad0b83665734831f7de7fc7e1f6972128.1728229346.git.~@wolfsden.cz
State New
Headers
Series [bug#73660] gexp: Improve support of Unicode characters. |

Commit Message

Tomas Volf Oct. 6, 2024, 3:42 p.m. UTC
Support for non-ASCII characters was mixed.  Some gexp forms did support them,
while others did not.  Combined with current value for
%default-port-conversion-strategy, that sometimes led to unpleasant surprises.
For example:

    (scheme-file "utf8" #~(with-output-to-file #$output
                            (λ _ (display "猫"))))

Was written to the store as:

    ((? _ (display "\u732b")))

No, that is not a font issue on your part, that is an actual #\? instead of the
lambda character.  Which, surprisingly, does not do what it should when
executed.

The solution is to switch to the C.UTF-8 locale where possible, since it is now
always available.  Or to explicitly set the port encoding.

No tests are provided, since the majority of tests in tests/gexp.scm use Guile
2, under which it tends to work.  The issues occur mostly with Guile 3.

I did test it locally using:

      #!/bin/sh
      set -eu
      set -x

      [ -f guix.scm ] || { echo >&2 Run from root of Guix repo.; exit 1; }
      [ -f gnu.scm  ] || { echo >&2 Run from root of Guix repo.; exit 1; }

      cat >猫.scm <<'EOF'
      (define-module (猫)
        #:export (say))

      (define (say)
        "nyaaaa~~~~!")
      EOF

      mkdir -p dir-with-utf8-file
      cp 猫.scm dir-with-utf8-file/

      cat >repro.scm <<'EOF'
      (use-modules (guix build utils)
                   (guix derivations)
                   (guix gexp)
                   (guix store)
                   (ice-9 ftw)
                   (ice-9 textual-ports))

      (define cat "猫")

      (define (drv-content drv)
        (call-with-input-file (derivation->output-path drv)
          get-string-all))

      (define (out-content out)
        (call-with-input-file out
          get-string-all))

      (define (drv-listing drv)
        (scandir (derivation->output-path drv)))

      (define (dir-listing dir)
        (scandir dir))

      (define-macro (test exp lower? report)
        (let ((type (car exp)))
          `(false-if-exception
            (let ((drv (with-store %store
                         (run-with-store %store
                           (,(if lower? lower-object identity) ,exp)))))
              (format #t "~%~a:~%" ',type)
              (when (with-store %store
                      (build-derivations %store (list drv)))
                (format #t "~a~%" (,report drv)))))))

      (test (computed-file "utf8"
                           #~(with-output-to-file #$output
                               (λ _ (display #$cat))))
            #t drv-content)

      (test (program-file "utf8"
                          #~((λ _ (display #$cat))))
            #t drv-content)

      (test (scheme-file "utf8"
                         #~((λ _ (display #$cat))))
            #t drv-content)

      (test (text-file* "utf8" cat cat cat)
            #f drv-content)

      (test (compiled-modules '((猫)))
            #f drv-listing)

      (test (file-union "utf8" `((,cat ,(plain-file "utf8" cat))))
            #t drv-listing)

      ;;; No fix needed:
      (test (imported-modules '((猫)))
            #f dir-listing)

      (test (local-file "dir-with-utf8-file" #:recursive? #t)
            #t dir-listing)

      (test (plain-file "utf8" cat)
            #t out-content)

      (test (mixed-text-file "utf8" cat cat cat)
            #t drv-content)

      (test (directory-union "utf8" (list (local-file "dir-with-utf8-file"
                                                      #:recursive? #t)))
            #t dir-listing)
      EOF

      guix shell -CWN -D guix glibc-locales -- \
           env LANG=C.UTF-8 ./pre-inst-env guix repl -- ./repro.scm

Before this commit, the output is:

      + '[' -f guix.scm ']'
      + '[' -f gnu.scm ']'
      + cat
      + mkdir -p dir-with-utf8-file
      + cp 猫.scm dir-with-utf8-file/
      + cat
      + guix shell -CWN -D guix glibc-locales -- env LANG=C.UTF-8 ./pre-inst-env guix repl -- ./repro.scm

      computed-file:
      ?

      program-file:
      #!/gnu/store/mfkz7fvlfpv3ppwbkv0imb19nrf95akf-guile-3.0.9/bin/guile --no-auto-compile
      !#
      ((? _ (display "\u732b")))

      scheme-file:
      ((? _ (display "\u732b")))

      text-file*:
      ???

      compiled-modules:
      building path(s) `/gnu/store/ay3jifyvliigfgnz67jf0kgngzpya5a5-module-import-compiled'
      Backtrace:
                 5 (primitive-load "/gnu/store/rn7b0dq6iqfmmqyqzamix2mjmfy?")
      In ice-9/eval.scm:
          619:8  4 (_ #f)
      In srfi/srfi-1.scm:
         460:18  3 (fold #<procedure 7ffff79245e0 at ice-9/eval.scm:336:1?> ?)
      In ice-9/eval.scm:
         245:16  2 (_ #(#(#<directory (guix build utils) 7ffff779f320>) # ?))
      In ice-9/boot-9.scm:
        1982:24  1 (_ _)
      In unknown file:
                 0 (stat "./???.scm" #<undefined>)

      ERROR: In procedure stat:
      In procedure stat: No such file or directory: "./???.scm"
      builder for `/gnu/store/dxg87135zcd6a1c92dlrkyvxlbhfwfld-module-import-compiled.drv' failed with exit code 1

      file-union:
      (. .. ?)

      imported-modules:
      (. .. 猫.scm)

      local-file:
      (. .. 猫.scm)

      plain-file:
      猫

      mixed-text-file:
      猫猫猫

      directory-union:
      (. .. 猫.scm)

Which I think you will agree is far from optimal.  After my fix the output
changes to:

      + '[' -f guix.scm ']'
      + '[' -f gnu.scm ']'
      + cat
      + mkdir -p dir-with-utf8-file
      + cp 猫.scm dir-with-utf8-file/
      + cat
      + guix shell -CWN -D guix glibc-locales -- env LANG=C.UTF-8 ./pre-inst-env guix repl -- ./repro.scm

      computed-file:
      猫

      program-file:
      #!/gnu/store/8kbmn359jqkgsbqgqxnmiryvd9ynz8w7-guile-3.0.9/bin/guile --no-auto-compile
      !#
      ((λ _ (display "猫")))

      scheme-file:
      ((λ _ (display "猫")))

      text-file*:
      猫猫猫

      compiled-modules:
      (. .. 猫.go)

      file-union:
      (. .. 猫)

      imported-modules:
      (. .. 猫.scm)

      local-file:
      (. .. 猫.scm)

      plain-file:
      猫

      mixed-text-file:
      猫猫猫

      directory-union:
      (. .. 猫.scm)

Which is actually what the user would expect.

I also added missing arguments to the documentation.

* guix/gexp.scm (computed-file):  Set LANG to C.UTF-8 by default.
(compiled-modules): Try to `setlocale'.
(gexp->script), (gexp->file): New `locale' argument defaulting to C.UTF-8.
(text-file*): Set output port encoding to UTF-8.
* doc/guix.texi (G-Expressions)[computed-file]: Document the changes.  Use
@var.  Document #:guile.
[gexp->script]: Document #:locale.  Fix default value for #:target.
[gexp->file]: Document #:locale, #:system and #:target.

Change-Id: Ib323b51af88a588b780ff48ddd04db8be7c729fb
---
 doc/guix.texi | 11 +++++++----
 guix/gexp.scm | 24 ++++++++++++++++++------
 2 files changed, 25 insertions(+), 10 deletions(-)
  

Comments

Tomas Volf Oct. 23, 2024, 12:13 a.m. UTC | #1
Hello,

any opinion regarding this patch?  I think it prevents a whole class of
annoying bugs, and some forms already had support for it; this just
extends it to all forms.

Have a nice day,
Tomas
  
Janneke Nieuwenhuizen Jan. 10, 2025, 4 p.m. UTC | #2
Tomas Volf writes:

Hi,

> any opinion regarding this patch?  I think it prevents whole class of
> annoying bugs, and some forms already had support for it, this just
> extend it to all forms.

While I don't feel qualified to LGTM this patch, it makes sense to me.

As discussed on IRC <https://logs.guix.gnu.org/guix/2025-01-05.log#134213>
I added the patch to core-packages-team, but also reverted it to avoid a
world rebuild.

Just now I pushed a newly rebased core-packages-team with the patch
in action, see <https://ci.guix.gnu.org/jobset/core-packages-team>.

Would you like to also keep an eye out for how that works?

Greetings,
Janneke
  
Tomas Volf Jan. 11, 2025, 12:42 a.m. UTC | #3
Janneke Nieuwenhuizen <janneke@gnu.org> writes:

> Tomas Volf writes:
>
> Hi,
>
>> any opinion regarding this patch?  I think it prevents whole class of
>> annoying bugs, and some forms already had support for it, this just
>> extend it to all forms.
>
> While I don't feel qualified to LGTM this patch, it makes sense to me.
>
> As discussed on IRC <https://logs.guix.gnu.org/guix/2025-01-05.log#134213>
> I added the patch to core-packages-team, but also reverted it to avoid a
> world rebuild.

Re-reading the IRC log I have noticed I forgot to say my thanks, so,
thank you :)

>
> Just now I pushed a newlyrebased core-packages-team with the patch
> in action, see <https://ci.guix.gnu.org/jobset/core-packages-team>.
>
> Would you like to also keep an eye out for how that works?

I can take a look from time to time (seems to still be building).  Let
us hope nothing burns down.  I *think* it should not really break
anything, but who knows, Guix is a complex beast and I am not that smart.

Have a nice weekend,
Tomas
  
Tomas Volf Jan. 11, 2025, 5:07 p.m. UTC | #4
Janneke Nieuwenhuizen <janneke@gnu.org> writes:

> Would you like to also keep an eye out for how that works?

It seems the evaluation failed, however I have no idea if that is due to
my changes or not.  The first issue I see in the log seems to be:

--8<---------------cut here---------------start------------->8---
building of `/gnu/store/5qizz7ba86rd979xspsw3vi2xpg6gq7b-glibc-headers-mesboot-2.16.0.drv' timed out after 3600 seconds of silence
--8<---------------cut here---------------end--------------->8---

For what it is worth, it fails for me locally as well, but with a
different error:

--8<---------------cut here---------------start------------->8---
build of /gnu/store/n45z6cfa9i3jdh07q1ib6pcbll8j6jn7-libstdc++-boot0-4.9.4.drv failed
View build log at '/var/log/guix/drvs/n4/5z6cfa9i3jdh07q1ib6pcbll8j6jn7-libstdc++-boot0-4.9.4.drv.gz'.
cannot build derivation `/gnu/store/gai3hg9c8qb89qlz8fwrgpscmf74g6c8-gcc-cross-boot0-14.2.0.drv': 1 dependencies couldn't be built
cannot build derivation `/gnu/store/f2di3rzlyqa2xsby7z197wsldvqixq0l-gcc-cross-boot0-wrapped-14.2.0.drv': 1 dependencies couldn't be built
--8<---------------cut here---------------end--------------->8---

Which, looking into the log leads to:

--8<---------------cut here---------------start------------->8---
starting phase `patch-tzdb.cc'
error: in phase 'patch-tzdb.cc': uncaught exception:
system-error "mkstemp!" "~A" ("No such file or directory") (2) 
phase `patch-tzdb.cc' failed after 0.0 seconds
--8<---------------cut here---------------end--------------->8---

I am not sure this is caused by my changes.  When I revert the commit:

--8<---------------cut here---------------start------------->8---
$ git -C guix-proper log -1
commit 178c8707fc795b7612ed493523a2f4ef5a71966d (HEAD -> xx)
Author: Linux User
Date:   Sat Jan 11 16:56:00 2025 +0000

    Revert "gexp: Improve support of Unicode characters."
    
    This reverts commit 3532efb0167dae540d9b968b191aa76c4ec79212.
--8<---------------cut here---------------end--------------->8---

It still fails with the same error.

Any ideas?

Tomas
  
Janneke Nieuwenhuizen Jan. 11, 2025, 6:09 p.m. UTC | #5
Tomas Volf writes:

Hello Tomas,

> Janneke Nieuwenhuizen <janneke@gnu.org> writes:
>
>> Would you like to also keep an eye out for how that works?
>
> It seems the evaluation failed, however I have no idea if that is due to
> my changes or not.  The first issue I see in the log seems to be:
>
> building of
> `/gnu/store/5qizz7ba86rd979xspsw3vi2xpg6gq7b-glibc-headers-mesboot-2.16.0.drv'
> timed out after 3600 seconds of silence

Ah.  I wondered why there was a big red cross instead of a lot of nice
green builds...

> For what it is worth, it fails to me locally as well, but with a
> different error:
>
> build of /gnu/store/n45z6cfa9i3jdh07q1ib6pcbll8j6jn7-libstdc++-boot0-4.9.4.drv failed
> View build log at '/var/log/guix/drvs/n4/5z6cfa9i3jdh07q1ib6pcbll8j6jn7-libstdc++-boot0-4.9.4.drv.gz'.
> cannot build derivation `/gnu/store/gai3hg9c8qb89qlz8fwrgpscmf74g6c8-gcc-cross-boot0-14.2.0.drv': 1 dependencies couldn't be built
> cannot build derivation `/gnu/store/f2di3rzlyqa2xsby7z197wsldvqixq0l-gcc-cross-boot0-wrapped-14.2.0.drv': 1 dependencies couldn't be built
>
>
> Which, looking into the log leads to:
>
> starting phase `patch-tzdb.cc'
> error: in phase 'patch-tzdb.cc': uncaught exception:
> system-error "mkstemp!" "~A" ("No such file or directory") (2) 
> phase `patch-tzdb.cc' failed after 0.0 seconds

Weird!  Oh wait, I removed guards around that stage.

core-packages-team-old has

--8<---------------cut here---------------start------------->8---
          #$@(if (target-hurd64?)
                 #~((add-after 'unpack 'patch-hurd64
                      (lambda _
                        (substitute* "libstdc++-v3/src/c++20/tzdb.cc"
                          (("#if ! defined _GLIBCXX_ZONEINFO_DIR")
                           "#if __GNU__ || ! defined _GLIBCXX_ZONEINFO_DIR")))))
                 '())
          #$@(if (and (target-x86-64?) (target-linux?)
                      (version>=? (package-version gcc) "14"))
                 #~((add-after 'unpack 'patch-x86_64-linux
                      (lambda _
                        (substitute* "libstdc++-v3/src/c++20/tzdb.cc"
                          (("#if ! defined _GLIBCXX_ZONEINFO_DIR")
                           "#if __x86_64__ || ! defined _GLIBCXX_ZONEINFO_DIR")))))
                 '())
          #$@(if (and (target-x86-32?) (target-linux?)
                      (version>=? (package-version gcc) "14"))
                 #~((add-after 'unpack 'patch-x86_64-linux
                      (lambda _
                        (substitute* "libstdc++-v3/src/c++20/tzdb.cc"
                          (("#if ! defined _GLIBCXX_ZONEINFO_DIR")
                           "#if __i386__ || __x86_64__ || ! defined _GLIBCXX_ZONEINFO_DIR")))))
                 '())
          #$@(if (and (target-linux?)
                      (not (target-x86-64?))
                      (not (target-x86-32?))
                      (version>=? (package-version gcc) "14"))
                 #~((add-after 'unpack 'patch-tzdb.cc
                      (lambda _
                        (substitute* "libstdc++-v3/src/c++20/tzdb.cc"
                          (("#if ! defined _GLIBCXX_ZONEINFO_DIR")
                           "#if 1 // ! defined _GLIBCXX_ZONEINFO_DIR")))))
                 '()))
--8<---------------cut here---------------end--------------->8---

and the new core-packages-team has

--8<---------------cut here---------------start------------->8---
          (add-after 'unpack 'patch-tzdb.cc
            (lambda _
              (substitute* "libstdc++-v3/src/c++20/tzdb.cc"
                (("#if ! defined _GLIBCXX_ZONEINFO_DIR")
                 "#if 1 // ! defined _GLIBCXX_ZONEINFO_DIR")))))
--8<---------------cut here---------------end--------------->8---

ow, there it already is.  Silly me, we need the gcc-14 guard.  I was so
happy all systems seemed to need the same code that I also removed the
check for gcc-14.

> I am not sure this is caused by my changes.  When I revert the commit:

Certainly not!  But thanks for trying!

[..]

> Any ideas?

Meanwhile, because it seemed the build farm stopped working for me, I
started to build myself again and currently have

--8<---------------cut here---------------start------------->8---
successfully built /gnu/store/5qizz7ba86rd979xspsw3vi2xpg6gq7b-glibc-headers-mesboot-2.16.0.drv
successfully built /gnu/store/91212rdl4cn4rr8aqfrbilxagmx9fwj3-glibc-mesboot-2.16.0.drv
successfully built /gnu/store/5a0bd35brzf1sgnw10slaxipmxa3cafn-gcc-mesboot1-wrapper-4.6.4.drv
--8<---------------cut here---------------end--------------->8---

so I didn't see this problem yet.  I've pushed a squash! commit that
should fix this.  Thanks!

Greetings,
Janneke
  
Ludovic Courtès Jan. 12, 2025, 3:19 p.m. UTC | #6
Hello,

Tomas Volf <~@wolfsden.cz> skribis:

> * guix/gexp.scm (computed-file):  Set LANG to C.UTF-8 by default.
> (compiled-modules): Try to `setlocale'.
> (gexp->script), (gexp->file): New `locale' argument defaulting to C.UTF-8.
> (text-file*): Set output port encoding to UTF-8.
> * doc/guix.texi (G-Expressions)[computed-file]: Document the changes.  Use
> @var.  Document #:guile.
> [gexp->script]: Document #:locale.  Fix default value for #:target.
> [gexp->file]: Document #:locale, #:system and #:target.
>
> Change-Id: Ib323b51af88a588b780ff48ddd04db8be7c729fb

[...]

>  (define* (computed-file name gexp
> -                        #:key guile (local-build? #t) (options '()))
> +                        #:key
> +                        guile
> +                        (local-build? #t)
> +                        (options '(#:env-vars (("LANG" . "C.UTF-8")))))

I’d suggest LC_CTYPE (or LC_ALL?) rather than LANG.

Also, what about making it the default for the #:env-vars of
‘gexp->derivation’?  That way it wouldn’t need to be repeated in several
places.

> @@ -1700,6 +1703,9 @@ (define* (compiled-modules modules
>                         (system base target)
>                         (system base compile))
>  
> +          ;; Best effort.  The locale is not installed in all contexts.
> +          (false-if-exception (setlocale LC_ALL "C.UTF-8"))

Sounds good.  I would make it a separate patch.

s/in all contexts/when cross-compiling/

> @@ -1990,7 +1996,8 @@ (define* (gexp->script name exp
>                         #:key (guile (default-guile))
>                         (module-path %load-path)
>                         (system (%current-system))
> -                       (target 'current))
> +                       (target 'current)
> +                       (locale "C.UTF-8"))

I would remove this argument and instead add an explicit, hard-coded:

  (set-port-encoding! port "UTF-8")

in the body of ‘call-with-output-file’ here, just like you did below.

>  (define* (text-file* name #:rest text)
>    "Return as a monadic value a derivation that builds a text file containing
> @@ -2108,6 +2119,7 @@ (define* (text-file* name #:rest text)
>    (define builder
>      (gexp (call-with-output-file (ungexp output "out")
>              (lambda (port)
> +              (set-port-encoding! port "UTF-8")
>                (display (string-append (ungexp-splicing text)) port)))))

LGTM.  This can be moved to a separate file.

How does that sound?

Apologies for not replying earlier!

Ludo’.
  
Tomas Volf Jan. 23, 2025, 10:58 p.m. UTC | #7
Ludovic Courtès <ludo@gnu.org> writes:

> Hello,
>
> Tomas Volf <~@wolfsden.cz> skribis:
>
>> * guix/gexp.scm (computed-file):  Set LANG to C.UTF-8 by default.
>> (compiled-modules): Try to `setlocale'.
>> (gexp->script), (gexp->file): New `locale' argument defaulting to C.UTF-8.
>> (text-file*): Set output port encoding to UTF-8.
>> * doc/guix.texi (G-Expressions)[computed-file]: Document the changes.  Use
>> @var.  Document #:guile.
>> [gexp->script]: Document #:locale.  Fix default value for #:target.
>> [gexp->file]: Document #:locale, #:system and #:target.
>>
>> Change-Id: Ib323b51af88a588b780ff48ddd04db8be7c729fb
>
> [...]
>
>>  (define* (computed-file name gexp
>> -                        #:key guile (local-build? #t) (options '()))
>> +                        #:key
>> +                        guile
>> +                        (local-build? #t)
>> +                        (options '(#:env-vars (("LANG" . "C.UTF-8")))))
>
> I’d suggest LC_CTYPE (or LC_ALL?) rather than LANG.

Oh, yeah, you are right, after reading the specification (and verifying
Guile takes the variable into account), LC_CTYPE seems like a better
fit.

>
> Also, what about making it the default for the #:env-vars of
> ‘gexp->derivation’?  That way it wouldn’t need to be repeated in several
> places.

I *think* the original motivation was to keep the gexp->derivation as
impartial as possible, since I do not know what people are using it for.
But it has been some time since I wrote this, so I am not fully sure.  There
are a few downsides.  Testing any changes now takes a long time since I
need to do a full bootstrap due to changing gexp->derivation and there are
(I presume new and harmless) warnings `warning: failed to install
locale: Invalid argument' during the bootstrap.

But the change itself is much more localized now and the Unicode support
more likely to "just work" for any new gexp forms added in the future.
So maybe you are right and it is the right way.

Anyway, I followed the suggestion and v2 moves the LC_CTYPE setting to
gexp->derivation.  The test script from the commit message still (after
many hours of bootstrapping) works.

>
>> @@ -1700,6 +1703,9 @@ (define* (compiled-modules modules
>>                         (system base target)
>>                         (system base compile))
>>
>> +          ;; Best effort.  The locale is not installed in all contexts.
>> +          (false-if-exception (setlocale LC_ALL "C.UTF-8"))
>
> Sounds good.  I would make it a separate patch.

Somewhat done.  I have made it a separate commit, but still included in
v2.

>
> s/in all contexts/when cross-compiling/

Interesting, I have modified the comment, however would you be willing
to expand on this a bit?  Why is the C.UTF-8 locale not available when
cross-compiling?  The Guile running this script runs on the build host,
using the build host's glibc and the build host's locale definitions, no?  So I
assumed the locale *should* be available.  I feel like I am missing
something fundamental about how Guix works here.

>
>> @@ -1990,7 +1996,8 @@ (define* (gexp->script name exp
>>                         #:key (guile (default-guile))
>>                         (module-path %load-path)
>>                         (system (%current-system))
>> -                       (target 'current))
>> +                       (target 'current)
>> +                       (locale "C.UTF-8"))
>
> I would remove this argument and instead add an explicit, hard-coded:
>
>   (set-port-encoding! port "UTF-8")
>
> in the body of ‘call-with-output-file’ here, just like you did below.

Done.

>
>>  (define* (text-file* name #:rest text)
>>    "Return as a monadic value a derivation that builds a text file containing
>> @@ -2108,6 +2119,7 @@ (define* (text-file* name #:rest text)
>>    (define builder
>>      (gexp (call-with-output-file (ungexp output "out")
>>              (lambda (port)
>> +              (set-port-encoding! port "UTF-8")
>>                (display (string-append (ungexp-splicing text)) port)))))
>
> LGTM.  This can be moved to a separate file.

By "separate file" you mean separate patch and/or commit?

>
> How does that sound?
>
> Apologies for not replying earlier!

No worries, thank you for finding the time to look at this. ^_^  The v2
is much smaller.

Tomas
  
Ludovic Courtès Jan. 24, 2025, 2:16 p.m. UTC | #8
Hi Tomas!

Tomas Volf <~@wolfsden.cz> skribis:

> Anyway, I followed the suggestion and v2 moves the LC_CTYPE setting to
> gexp->derivation.  The test script from the commit message still (after
> many hours of bootstrapping) works.

Sounds good!

>>> +          ;; Best effort.  The locale is not installed in all contexts.
>>> +          (false-if-exception (setlocale LC_ALL "C.UTF-8"))
>>
>> Sounds good.  I would make it a separate patch.
>
> Somewhat done.  I have made it a separate commit, but still included in
> v2.
>
>>
>> s/in all contexts/when cross-compiling/
>
> Interesting, I have modified the comment, however would you be willing
> to expand on this a bit?  Why is the C.UTF-8 locale not available when
> cross-compiling?  The Guile running this script runs on the build host,
> using build host's glibc and build host's locale definitions no?  So I
> assumed the locale *should* be available.  I feel like I am missing
> something fundamental about how Guix works here.

See the ‘install-utf8-c-locale’ phase of ‘glibc’: since there’s no
‘localedef’ program to use when cross-compiling, that phase is a no-op.

             ;; FIXME: When cross-compiling, attempt to use
             ;; 'localedef' from the same libc version.
             (invoke ,(if (%current-target-system)
                          "true"
                          '(string-append bin "/localedef"))
                     "--no-archive" "--prefix" locale
                     "-i" "C" "-f" "UTF-8"
                     (string-append locale "/C.UTF-8"))

It’s a problem because then we cannot really assume that C.UTF-8 is
*always* available, even though that was the goal (see
<https://issues.guix.gnu.org/67686>.)

AFAICS, the format is actually architecture-independent:

--8<---------------cut here---------------start------------->8---
$ guix build -e '(@@ (gnu packages base) glibc-utf8-locales)' -s i686-linux
/gnu/store/4v8xzpkkkzpkm6qmxjv1lcma69chdwkl-glibc-utf8-locales-2.39
$ guix build -e '(@@ (gnu packages base) glibc-utf8-locales)' 
/gnu/store/f3kpp3dh25893y79n32qlqqiwb5my4zg-glibc-utf8-locales-2.39
$ diff -r /gnu/store/4v8xzpkkkzpkm6qmxjv1lcma69chdwkl-glibc-utf8-locales-2.39/lib/locale/2.39/C.utf8  /gnu/store/f3kpp3dh25893y79n32qlqqiwb5my4zg-glibc-utf8-locales-2.39/lib/locale/2.39/C.utf8
$ echo $?
0
$ guix build -e '(@@ (gnu packages base) glibc-utf8-locales)' -s aarch64-linux
/gnu/store/0i1brwncg9rpf7pvh9hs5vrchmfb3c9q-glibc-utf8-locales-2.39
$ guix build -e '(@@ (gnu packages base) glibc-utf8-locales)' -s armhf-linux
/gnu/store/8x26ik4jlvljcnx1jhfd83r5lyx04d15-glibc-utf8-locales-2.39
$ diff -r /gnu/store/4v8xzpkkkzpkm6qmxjv1lcma69chdwkl-glibc-utf8-locales-2.39/lib/locale/2.39/C.utf8  /gnu/store/0i1brwncg9rpf7pvh9hs5vrchmfb3c9q-glibc-utf8-locales-2.39/lib/locale/2.39/C.utf8
$ echo $?
0
$ diff -r /gnu/store/4v8xzpkkkzpkm6qmxjv1lcma69chdwkl-glibc-utf8-locales-2.39/lib/locale/2.39/C.utf8  /gnu/store/8x26ik4jlvljcnx1jhfd83r5lyx04d15-glibc-utf8-locales-2.39/lib/locale/2.39/C.utf8
$ echo $?
0
--8<---------------cut here---------------end--------------->8---

So it might work by replacing "true" by "localedef" in this phase (and
we could do that on ‘master’), under the assumption that the
cross-compiled libc and the one that provides the ‘localedef’ command
are the same version, which is usually the case.

We should do that.

>>>  (define* (text-file* name #:rest text)
>>>    "Return as a monadic value a derivation that builds a text file containing
>>> @@ -2108,6 +2119,7 @@ (define* (text-file* name #:rest text)
>>>    (define builder
>>>      (gexp (call-with-output-file (ungexp output "out")
>>>              (lambda (port)
>>> +              (set-port-encoding! port "UTF-8")
>>>                (display (string-append (ungexp-splicing text)) port)))))
>>
>> LGTM.  This can be moved to a separate file.
>
> By "separate file" you mean separate patch and/or commit?

Yes.

Thanks!

Ludo’.
  
Janneke Nieuwenhuizen Jan. 24, 2025, 4:08 p.m. UTC | #9
Tomas Volf writes:

Hello Tomas,

Thank you for v2.  On IRC, civodul writes

<https://logs.guix.gnu.org/guix/2025-01-24.log#151823>
--8<---------------cut here---------------start------------->8---
<civodul> janneke: re gexp/Unicode, the 3 patches LGTM so yes, feel free to
          apply them!  [15:18]
--8<---------------cut here---------------end--------------->8---

so I've applied this series to a freshly rebased core-packages-team.  The
only changes I made were to add

--8<---------------cut here---------------start------------->8---
diff --git a/guix/gexp.scm b/guix/gexp.scm
@@ -5,6 +5,7 @@
 ;;; Copyright © 2019, 2020 Mathieu Othacehe <m.othacehe@gmail.com>
 ;;; Copyright © 2020 Maxim Cournoyer <maxim.cournoyer@gmail.com>
 ;;; Copyright © 2021, 2022 Maxime Devos <maximedevos@telenet.be>
+;;; Copyright © 2025 Tomas Volf <~@wolfsden.cz>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
--8<---------------cut here---------------end--------------->8---

/update your copyright

--8<---------------cut here---------------start------------->8---
diff --git a/doc/guix.texi b/doc/guix.texi
@@ -123,7 +123,7 @@
 Copyright @copyright{} 2023 Thomas Ieong@*
 Copyright @copyright{} 2023 Saku Laesvuori@*
 Copyright @copyright{} 2023 Graham James Addis@*
-Copyright @copyright{} 2023, 2024 Tomas Volf@*
+Copyright @copyright{} 2023, 2024, 2025 Tomas Volf@*
 Copyright @copyright{} 2024 Herman Rimm@*
 Copyright @copyright{} 2024 Matthew Trzcinski@*
 Copyright @copyright{} 2024 Richard Sent@*
--8<---------------cut here---------------end--------------->8---

Thank you!

Greetings,
Janneke
  

Patch

diff --git a/doc/guix.texi b/doc/guix.texi
index 52e36e4354..683ba2f44b 100644
--- a/doc/guix.texi
+++ b/doc/guix.texi
@@ -12270,7 +12270,9 @@  G-Expressions
 This is the declarative counterpart of @code{text-file}.
 @end deffn
 
-@deffn {Procedure} computed-file name gexp [#:local-build? #t] [#:options '()]
+@deffn {Procedure} computed-file @var{name} @var{gexp} @
+  [#:local-build? #t] [#:guile] @
+  [#:options '(#:env-vars (("LANG" . "C.UTF-8")))]
 Return an object representing the store item @var{name}, a file or
 directory computed by @var{gexp}.  When @var{local-build?} is true (the
 default), the derivation is built locally.  @var{options} is a list of
@@ -12281,7 +12283,7 @@  G-Expressions
 
 @deffn {Monadic Procedure} gexp->script @var{name} @var{exp} @
   [#:guile (default-guile)] [#:module-path %load-path] @
-  [#:system (%current-system)] [#:target #f]
+  [#:system (%current-system)] [#:target 'current] [#:locale "C.UTF-8"]
 Return an executable script @var{name} that runs @var{exp} using
 @var{guile}, with @var{exp}'s imported modules in its search path.
 Look up @var{exp}'s modules in @var{module-path}.
@@ -12318,8 +12320,9 @@  G-Expressions
 
 @deffn {Monadic Procedure} gexp->file @var{name} @var{exp} @
             [#:set-load-path? #t] [#:module-path %load-path] @
-            [#:splice? #f] @
-            [#:guile (default-guile)]
+            [#:splice? #f] [#:guile (default-guile)] @
+            [#:system (%current-system)] [#:target 'current] @
+            [#:locale "C.UTF-8"]
 Return a derivation that builds a file @var{name} containing @var{exp}.
 When @var{splice?}  is true, @var{exp} is considered to be a list of
 expressions that will be spliced in the resulting file.
diff --git a/guix/gexp.scm b/guix/gexp.scm
index e44aea6420..c8aba91779 100644
--- a/guix/gexp.scm
+++ b/guix/gexp.scm
@@ -597,7 +597,10 @@  (define-record-type <computed-file>
   (options    computed-file-options))             ;list of arguments
 
 (define* (computed-file name gexp
-                        #:key guile (local-build? #t) (options '()))
+                        #:key
+                        guile
+                        (local-build? #t)
+                        (options '(#:env-vars (("LANG" . "C.UTF-8")))))
   "Return an object representing the store item NAME, a file or directory
 computed by GEXP.  When LOCAL-BUILD? is #t (the default), it ensures the
 corresponding derivation is built locally.  OPTIONS may be used to pass
@@ -1700,6 +1703,9 @@  (define* (compiled-modules modules
                        (system base target)
                        (system base compile))
 
+          ;; Best effort.  The locale is not installed in all contexts.
+          (false-if-exception (setlocale LC_ALL "C.UTF-8"))
+
           (define modules
             (getenv "modules"))
 
@@ -1990,7 +1996,8 @@  (define* (gexp->script name exp
                        #:key (guile (default-guile))
                        (module-path %load-path)
                        (system (%current-system))
-                       (target 'current))
+                       (target 'current)
+                       (locale "C.UTF-8"))
   "Return an executable script NAME that runs EXP using GUILE, with EXP's
 imported modules in its search path.  Look up EXP's modules in MODULE-PATH."
   (mlet* %store-monad ((target (if (eq? target 'current)
@@ -2033,7 +2040,8 @@  (define* (gexp->script name exp
                       ;; These derivations are not worth offloading or
                       ;; substituting.
                       #:local-build? #t
-                      #:substitutable? #f)))
+                      #:substitutable? #f
+                      #:env-vars `(("LANG" . ,locale)))))
 
 (define* (gexp->file name exp #:key
                      (guile (default-guile))
@@ -2041,7 +2049,8 @@  (define* (gexp->file name exp #:key
                      (module-path %load-path)
                      (splice? #f)
                      (system (%current-system))
-                     (target 'current))
+                     (target 'current)
+                     (locale "C.UTF-8"))
   "Return a derivation that builds a file NAME containing EXP.  When SPLICE?
 is true, EXP is considered to be a list of expressions that will be spliced in
 the resulting file.
@@ -2081,7 +2090,8 @@  (define* (gexp->file name exp #:key
                           #:local-build? #t
                           #:substitutable? #f
                           #:system system
-                          #:target target)
+                          #:target target
+                          #:env-vars `(("LANG" . ,locale)))
         (gexp->derivation name
                           (gexp
                            (call-with-output-file (ungexp output)
@@ -2098,7 +2108,8 @@  (define* (gexp->file name exp #:key
                           #:local-build? #t
                           #:substitutable? #f
                           #:system system
-                          #:target target))))
+                          #:target target
+                          #:env-vars `(("LANG" . ,locale))))))
 
 (define* (text-file* name #:rest text)
   "Return as a monadic value a derivation that builds a text file containing
@@ -2108,6 +2119,7 @@  (define* (text-file* name #:rest text)
   (define builder
     (gexp (call-with-output-file (ungexp output "out")
             (lambda (port)
+              (set-port-encoding! port "UTF-8")
               (display (string-append (ungexp-splicing text)) port)))))
 
   (gexp->derivation name builder