% \iffalse meta-comment
%
%% File: l3text-utils.dtx
%
% Copyright (C) 2026 The LaTeX Project
%
% It may be distributed and/or modified under the conditions of the
% LaTeX Project Public License (LPPL), either version 1.3c of this
% license or (at your option) any later version.  The latest version
% of this license is in the file
%
%    https://www.latex-project.org/lppl.txt
%
% This file is part of the "l3kernel bundle" (The Work in LPPL)
% and all files in that bundle must be distributed together.
%
% -----------------------------------------------------------------------
%
% The development version of the bundle can be found at
%
%    https://github.com/latex3/latex3
%
% for those people who are interested.
%
%<*driver>
\documentclass[full,kernel]{l3doc}
\begin{document}
  \DocInput{\jobname.dtx}
\end{document}
%</driver>
% \fi
%
% \title{^^A
%   The \pkg{l3text-utils} module\\ Text processing (support utilities)^^A
% }
%
% \author{^^A
%  The \LaTeX{} Project\thanks
%    {^^A
%      E-mail:
%        \href{mailto:latex-team@latex-project.org}
%          {latex-team@latex-project.org}^^A
%    }^^A
% }
%
% \date{Released 2026-03-16}
%
% \maketitle
%
% \begin{documentation}
%
% \end{documentation}
%
% \begin{implementation}
%
% \section{\pkg{l3text-utils} implementation}
%
%    \begin{macrocode}
%<*code>
%    \end{macrocode}
%
% \subsection{Parsing BCP~47 strings}
%
%    \begin{macrocode}
%<@@=text_bcp>
%    \end{macrocode}
%
% For a reference implementation in JavaScript, see
% \url{https://github.com/wooorm/bcp-47/}. This gives clear details of the
% overall algorithm needed.
%
% \begin{variable}{\c_@@_normal_prop}
%   There are a small number of non-standard tags which are grandfathered into
%   the current standard. Here, we set up a mapping to the equivalent standard
%   version, which allows us to avoid complexity. The commented lines have
%   no equivalent I can track down at the moment! The \texttt{prop} is made
%   into the linked form as this gains efficiency in the lookup.
%    \begin{macrocode}
\prop_const_from_keyval:Nn \c_@@_normal_prop
  {
    en-gb-oed   = en-gb-oxendict ,
    i-ami       = ami            ,
    i-bnn       = bnn            ,
    % i-default   =                ,
    % i-enochian  =                ,
    i-hak       = hak            ,
    i-klingon   = tlh            ,
    i-lux       = lb             ,
    % i-mingo     =                ,
    i-navajo    = nv             ,
    i-pwn       = pwn            ,
    i-tao       = tao            ,
    i-tay       = tay            ,
    i-tsu       = tsu            ,
    sgn-be-fr   = sfb            ,
    sgn-be-nl   = vgt            ,
    sgn-ch-de   = sgg            ,
    art-lojban  = jbo            ,
    % cel-gaulish =                ,
    no-bok      = nb             ,
    no-nyn      = nn             ,
    zh-guoyu    = cmn            ,
    zh-hakka    = hak            ,
    % zh-min      =                ,
    zh-min-nan  = nan            ,
    zh-xiang    = hsn
  }
\prop_make_linked:N \c_@@_normal_prop
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}[EXP]{\text_bcp_parse:n}
% \begin{macro}[EXP]{\@@_parse_auxi:n}
% \begin{macro}[EXP]{\@@_parse_auxii:nn}
% \begin{macro}[EXP]{\@@_parse_auxiii:n}
% \begin{macro}[EXP]{\@@_parse_auxiv:w}
% \begin{macro}[EXP]{\@@_parse_auxv:w}
% \begin{macro}[EXP]{\@@_parse_auxvi:n}
% \begin{macro}[EXP]{\@@_parse_auxvii:NNNN}
% \begin{macro}[EXP]{\@@_parse_extlang:n}
% \begin{macro}[EXP]{\@@_parse_extlang:nw}
% \begin{macro}[EXP]{\@@_parse_extlang:nn}
% \begin{macro}[EXP]{\@@_parse_script:n}
% \begin{macro}[EXP]{\@@_parse_script:w}
% \begin{macro}[EXP]{\@@_parse_region:n}
% \begin{macro}[EXP]{\@@_parse_region:w}
% \begin{macro}[EXP]{\@@_parse_variant_chk:n}
% \begin{macro}[EXP]{\@@_parse_variant_chk:NNNN}
% \begin{macro}[EXP]{\@@_parse_variant:n}
% \begin{macro}[EXP]{\@@_parse_variant:nw}
% \begin{macro}[EXP]{\@@_parse_variant:nn}
% \begin{macro}[EXP]{\@@_parse_variant_chk:nn}
% \begin{macro}[EXP]{\@@_parse_variant_chk:NNNNn}
% \begin{macro}[EXP]{\@@_parse_variant_end:nn}
% \begin{macro}[EXP]{\@@_parse_ext:n}
% \begin{macro}[EXP]{\@@_parse_ext:nN}
% \begin{macro}[EXP]{\@@_parse_ext:nNnw}
% \begin{macro}[EXP]{\@@_parse_private:nw}
% \begin{macro}[EXP]{\@@_parse_count:n}
% \begin{macro}[EXP]{\@@_parse_count_auxi:w}
% \begin{macro}[EXP]{\@@_parse_count_auxii:w}
% \begin{macro}[EXP]{\@@_parse_count_auxiii:w}
% \begin{macro}[EXP]{\@@_parse_count_auxiv:N}
%   Before we get to the business end of the parse, we need to deal with the
%   special cases: entirely blank input or one of the non-standard inputs
%   above. Blank input is an error. Anything else is case-folded, which also
%   turns it into a string, and the result is looked up in the mapping of
%   non-standard tags: the keys there are all lower case. If there is an
%   entry, the replacement is parsed, otherwise the case-folded input is
%   used unchanged.
%
%   For valid input the parser leaves seven brace groups in the input
%   stream, in order: language, extended language, script, region, variants,
%   extensions and private use. Parts which are not present give an empty
%   group.
%    \begin{macrocode}
\cs_new:Npn \text_bcp_parse:n #1
  {
    \tl_if_blank:nTF {#1}
      { \msg_expandable_error:nn { text } { bcp-blank } }
      {
        \exp_args:Ne \@@_parse_auxi:n
          { \str_casefold:n {#1} }
      }
  }
\cs_new:Npn \@@_parse_auxi:n #1
  {
    \exp_args:Ne \@@_parse_auxii:nn
      { \prop_item:Nn \c_@@_normal_prop {#1} } {#1}
  }
\cs_new:Npn \@@_parse_auxii:nn #1#2
  {
    \tl_if_blank:nTF {#1}
      { \@@_parse_auxiii:n {#2} }
      { \@@_parse_auxiii:n {#1} }
  }
%    \end{macrocode}
%   The main loop is set up to allow us to iterate over each block, separated
%   by a |-| token. The first block \emph{must} be the language, which can be
%   either two or three characters: this is an easy test. There doesn't have
%   to be any other input, so things could well stop here. If not, we need
%   to know how many characters are in the next block to proceed. When the
%   language is not valid, the rest of the input, including the end markers
%   added here, has to be discarded after the error is raised: the quarks
%   must never be left in the input stream.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_auxiii:n #1
  { \@@_parse_auxiv:w #1 - \q_recursion_tail - \q_recursion_stop }
\cs_new:Npn \@@_parse_auxiv:w #1 -
  {
    \int_compare:nTF { 1 < \@@_parse_count:n {#1} < 4 }
      { {#1} \@@_parse_auxv:w }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-lang }
        \use_none_delimit_by_q_recursion_stop:w
      }
  }
%    \end{macrocode}
%   We will see versions of this several times. We know that there are a number
%   of valid subtag types at this point, differentiated by their length. (A
%   length of zero is never valid, but we do not special case it in the
%   counting code as it's quite unlikely.) There's therefore a split to choose
%   the appropriate subtag parser.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_auxv:w #1 -
  {
    \quark_if_recursion_tail_stop_do:nn {#1}
      { { } { } { } { } { } { } }
    \int_case:nnF { \@@_parse_count:n {#1} }
      {
        { 1 } { { } { } { } { } \@@_parse_ext:n     }
        { 2 } { { } { }         \@@_parse_region:n  }
        { 3 } {                 \@@_parse_extlang:n }
        { 4 } { { }             \@@_parse_auxvi:n   }
        { 5 } { { } { } { }     \@@_parse_variant:n }
        { 6 } { { } { } { }     \@@_parse_variant:n }
        { 7 } { { } { } { }     \@@_parse_variant:n }
        { 8 } { { } { } { }     \@@_parse_variant:n }
      }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
        {#1}
  }
%    \end{macrocode}
%   In each branch, the empty groups stand for the parts of the tag which
%   have been skipped over, so that the chosen parser always starts writing
%   at the correct position. The block itself is passed on as a braced
%   argument after the selected function. At the end of the input there is
%   nothing left to parse, and all six remaining groups are empty.
%
%   There is one case that cannot be determined purely on block length.
%   Both variants and scripts can be made up of four characters, but for
%   variants the first character has to be a digit. The test is therefore
%   whether the character code of the first token lies in the range
%   |0|--|9|: if it does we have a variant, and need to fill in the (empty)
%   script and region first.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_auxvi:n #1
  { \@@_parse_auxvii:NNNN #1 }
\cs_new:Npn \@@_parse_auxvii:NNNN #1#2#3#4
  {
    \int_compare:nTF { `0 <= `#1 <= `9 }
      { { } { } \@@_parse_variant:n }
      { \@@_parse_script:n }
        {#1#2#3#4}
  }
%    \end{macrocode}
%   The first block allowed after the language is \enquote{extended language},
%   which can have up to three entries of three characters.
%   The entries found so far are carried in the first argument of the loop
%   as a list of brace groups. Whenever the extended language part ends,
%   that list is written out as a single brace group: this has to have the
%   same form whether the input stops here or another type of subtag
%   follows, so the list is wrapped in exactly one set of braces in every
%   branch. A fourth three-character block is an error.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_extlang:n #1
  { \@@_parse_extlang:nw { {#1} } }
\cs_new:Npn \@@_parse_extlang:nw #1#2 -
  {
    \quark_if_recursion_tail_stop_do:nn {#2}
      { {#1} { } { } { } { } { } }
    \int_case:nnF { \@@_parse_count:n {#2} }
      {
        { 1 } { {#1} { } { } { } \@@_parse_ext:n     }
        { 2 } { {#1} { }         \@@_parse_region:n  }
        { 3 } { \@@_parse_extlang:nn {#1}            }
        { 4 } { {#1}             \@@_parse_auxvi:n   }
        { 5 } { {#1} { } { }     \@@_parse_variant:n }
        { 6 } { {#1} { } { }     \@@_parse_variant:n }
        { 7 } { {#1} { } { }     \@@_parse_variant:n }
        { 8 } { {#1} { } { }     \@@_parse_variant:n }
      }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
        {#2}
  }
\cs_new:Npn \@@_parse_extlang:nn #1#2
  {
    \int_compare:nNnTF { \tl_count:n {#1} } = 3
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
      { \@@_parse_extlang:nw { #1 {#2} } }
  }
%    \end{macrocode}
%   The next valid block is a script: a single entry so not a lot to do.
%   The script is written out straight away, then the length of the
%   following block picks what comes next: an extension or private use
%   marker, a region or a variant. As a script has already been seen, a
%   four-character block here can only be a variant.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_script:n #1
  { {#1} \@@_parse_script:w }
\cs_new:Npn \@@_parse_script:w #1 -
  {
    \quark_if_recursion_tail_stop_do:nn {#1}
      { { } { } { } { } }
    \int_case:nnF { \@@_parse_count:n {#1} }
      {
        { 1 } { { } { } \@@_parse_ext:n         }
        { 2 } {         \@@_parse_region:n      }
        { 4 } { { }     \@@_parse_variant_chk:n }
        { 5 } { { }     \@@_parse_variant:n     }
        { 6 } { { }     \@@_parse_variant:n     }
        { 7 } { { }     \@@_parse_variant:n     }
        { 8 } { { }     \@@_parse_variant:n     }
      }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
        {#1}
  }
%    \end{macrocode}
%   Much the same story for the region: a single block with simply fewer
%   possible blocks after it.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_region:n #1
  { {#1} \@@_parse_region:w }
\cs_new:Npn \@@_parse_region:w #1 -
  {
    \quark_if_recursion_tail_stop_do:nn {#1}
      { { } { } { } }
    \int_case:nnF { \@@_parse_count:n {#1} }
      {
        { 1 } { { } \@@_parse_ext:n         }
        { 4 } {     \@@_parse_variant_chk:n }
        { 5 } {     \@@_parse_variant:n     }
        { 6 } {     \@@_parse_variant:n     }
        { 7 } {     \@@_parse_variant:n     }
        { 8 } {     \@@_parse_variant:n     }
      }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
        {#1}
  }
%    \end{macrocode}
%   The same idea about a four-character block as we've already seen: to be a
%   valid variant, it has to start with a digit. Unlike the earlier version, at
%   this stage a script is not allowed, so anything except a leading digit is
%   an error.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_variant_chk:n #1
  { \@@_parse_variant_chk:NNNN #1 }
\cs_new:Npn \@@_parse_variant_chk:NNNN #1#2#3#4
  {
    \int_compare:nTF { `0 <= `#1 <= `9 }
      { \@@_parse_variant:n }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
        {#1#2#3#4}
  }
%    \end{macrocode}
%   Variants form an open-ended list so a loop is required to handle this.
%   At each step, the length of the next block (if present) needs to be
%   checked: if it's a valid variant, keep collecting, otherwise it's an
%   extension or an error.
%   The loop carries the variants collected so far, as a list of brace
%   groups, in its first argument. The three functions it can dispatch to
%   all take the \emph{new} block as their first argument and the collected
%   list as their second, so the two have to be handed over in that order.
%   That way, a four-character block can be unpacked into separate tokens to
%   examine the first one, with the collected list following as a fifth,
%   braced, argument.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_variant:n #1
  { \@@_parse_variant:nw { {#1} } }
\cs_new:Npn \@@_parse_variant:nw #1#2 -
  {
    \quark_if_recursion_tail_stop_do:nn {#2}
      { {#1} { } { } }
    \int_case:nnF { \@@_parse_count:n {#2} }
      {
        { 1 } { \@@_parse_variant_end:nn }
        { 4 } { \@@_parse_variant_chk:nn }
        { 5 } { \@@_parse_variant:nn     }
        { 6 } { \@@_parse_variant:nn     }
        { 7 } { \@@_parse_variant:nn     }
        { 8 } { \@@_parse_variant:nn     }
      }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
        {#2} {#1}
  }
\cs_new:Npn \@@_parse_variant:nn #1#2
  { \@@_parse_variant:nw { #2 {#1} } }
\cs_new:Npn \@@_parse_variant_chk:nn #1#2
  { \@@_parse_variant_chk:NNNNn #1 {#2} }
\cs_new:Npn \@@_parse_variant_chk:NNNNn #1#2#3#4#5
  {
    \bool_lazy_or:nnTF
      { \int_compare_p:nNn {`#1} < { `0 } }
      { \int_compare_p:nNn {`#1} > { `9 } }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
      { \@@_parse_variant:nn }
        {#1#2#3#4} {#5}
  }
\cs_new:Npn \@@_parse_variant_end:nn #1#2
  { {#2} \@@_parse_ext:n {#1} }
%    \end{macrocode}
%   There are only three possible valid one-letter blocks: the extensions
%   |t| and |u|, and the private use marker |x|. All of these then allow an
%   open-ended set of subtags, the only restriction being these cannot be
%   one-letter other than after |x|. So we need to collect up quite a bit
%   of information whilst allowing for the fact that only one |u| or |t| should
%   occur.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_ext:n #1
  {
    \str_if_eq:nnTF {#1} { x }
      { { } \@@_parse_private:nw { } }
      { \@@_parse_ext:nN { } #1 }
  }
%    \end{macrocode}
%   Test for a valid letter, then start collecting up or switch to the private
%   use area. Each extension can only be given once, and the comparison needs
%   to be case-insensitive, so there is a little work to do.
%   The first argument here is the material for the extensions already
%   completed, stored as the extension letter followed by a braced list of
%   its subtags. The private use marker closes the extension group and
%   hands over to the private use loop. For |t| or |u|, it is an error if
%   the stored material starts with the same letter or if it already has
%   more than two items.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_ext:nN #1#2
  {
    \str_if_eq:nnTF {#2} { x }
      { {#1} \@@_parse_private:nw { } }
      {
        \bool_lazy_or:nnTF
          { \str_if_eq_p:nn {#2} { t } }
          { \str_if_eq_p:nn {#2} { u } }
          {
            \bool_lazy_or:nnTF
              { \tl_if_head_eq_charcode_p:nN {#1} #2 }
              { \int_compare_p:nNn { \tl_count:n {#1} } > 2 }
              {
                \msg_expandable_error:nn { text } { bcp-invalid-subtag }
                \use_none_delimit_by_q_recursion_stop:w
              }
              { \@@_parse_ext:nNnw {#1} #2 { } }
          }
          {
            \msg_expandable_error:nn { text } { bcp-invalid-subtag }
            \use_none_delimit_by_q_recursion_stop:w
          }
      }
  }
%    \end{macrocode}
%   The loop for extensions: largely just collection with a test in case we
%   find another extension or private use marker. The arguments are the
%   completed extensions, the letter of the current extension, the subtags
%   collected for it so far and the next block. An extension with no
%   subtags at all is an error, both at the end of the input and when
%   another one-letter block turns up. A one-letter block is case-folded
%   before it is passed back for testing.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_ext:nNnw #1#2#3#4 -
  {
    \quark_if_recursion_tail_stop_do:nn {#4}
      {
        \str_if_empty:nTF {#3}
          { \msg_expandable_error:nn { text } { bcp-invalid-subtag } }
          { { #1 #2 {#3} } { } }
      }
    \int_compare:nNnTF { \@@_parse_count:n {#4} } = 1
      {
        \str_if_empty:nTF {#3}
          {
            \msg_expandable_error:nn { text } { bcp-invalid-subtag }
            \use_none_delimit_by_q_recursion_stop:w
          }
          {
            \use:e
              {
                \exp_not:n { \@@_parse_ext:nN { #1 #2 {#3} } }
                \str_casefold:n {#4}
              }
          }
      }
      {
        \int_compare:nTF { 1 < \@@_parse_count:n {#4} < 9 }
          { \@@_parse_ext:nNnw {#1} #2 { #3 {#4} } }
          {
            \msg_expandable_error:nn { text } { bcp-invalid-subtag }
            \use_none_delimit_by_q_recursion_stop:w
          }
      }
  }
%    \end{macrocode}
%   Private use area: all bets are off! The only test here is that we need at
%   least one subtag, and they all need to be shorter than nine characters.
% \begin{macrocode} \cs_new:Npn \@@_parse_private:nw #1#2 - { \quark_if_recursion_tail_stop_do:nn {#2} { \str_if_empty:nTF {#1} { \msg_expandable_error:nn { text } { bcp-invalid-subtag } } { {#1} } } \int_compare:nTF { 0 < \@@_parse_count:n {#2} < 9 } { \@@_parse_private:nw { #1 {#2} } } { \msg_expandable_error:nn { text } { bcp-invalid-subtag } \use_none_delimit_by_q_recursion_stop:w } } % \end{macrocode} % As BCP~47 is largely specified in terms of number of characters, there is % a need to count up repeatedly. Whilst \cs{tl_count:n} is reasonably fast, % the predictable nature of the input here means we can use a slightly % more focussed approach. We know that input can never be more than % 8~characters, so can test for that, then get the character number by % a simple expansion. This saves around half the tracing lines for typical % input lengths. % \begin{macrocode} \cs_new:Npn \@@_parse_count:n #1 { \@@_parse_count_auxi:w #1 \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil \q_stop {#1} } \cs_new:Npn \@@_parse_count_auxi:w #1#2#3#4#5#6#7#8#9 { \quark_if_nil:NTF #9 { \@@_parse_count_auxii:w } { \msg_expandable_error:nn { text } { bcp-invalid-lang } } } \cs_new:Npn \@@_parse_count_auxii:w #1 \q_stop #2 { \@@_parse_count_auxiii:w #2 876543210 \q_stop } \cs_new:Npn \@@_parse_count_auxiii:w #1#2#3#4#5#6#7#8#9 { \@@_parse_count_auxiv:N #9 } \cs_new:Npn \@@_parse_count_auxiv:N #1#2 \q_stop {#1} % \end{macrocode} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % % \begin{macrocode} \msg_new:nnn { text } { bcp-blank } { Empty~input~for~BCP~47~decoding. 
} \msg_new:nnn { text } { bcp-invalid-lang } { Invalid~language~in~BCP~input. } \msg_new:nnn { text } { bcp-invalid-subtag } { Invalid~subtag~in~BCP~input. } % \end{macrocode} % % \begin{macrocode} % % \end{macrocode} % % \end{implementation} % % \PrintIndex