-
-
Notifications
You must be signed in to change notification settings - Fork 1.8k
paste: support multi-byte delimiters and GNU escape sequences #10840
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,140 @@ | ||
| // This file is part of the uutils coreutils package. | ||
| // | ||
| // For the full copyright and license information, please view the LICENSE | ||
| // file that was distributed with this source code. | ||
|
|
||
| // spell-checker:ignore langinfo charmap eucjp euckr euctw CTYPE HKSCS hkscs localedata | ||
|
|
||
| //! Locale-aware multi-byte character length detection via `LC_CTYPE`. | ||
|
|
||
| use std::sync::OnceLock; | ||
|
|
||
| enum MbEncoding { | ||
| Utf8, | ||
| Gb18030, | ||
| EucJp, | ||
| EucKr, | ||
| Big5, | ||
| } | ||
|
|
||
| fn encoding_from_name(enc: &str) -> MbEncoding { | ||
| match enc { | ||
| "gb18030" | "gbk" | "gb2312" => MbEncoding::Gb18030, | ||
| "euc-jp" | "eucjp" => MbEncoding::EucJp, | ||
| "euc-kr" | "euckr" => MbEncoding::EucKr, | ||
| "big5" | "big5-hkscs" | "big5hkscs" | "euc-tw" | "euctw" => MbEncoding::Big5, | ||
| _ => MbEncoding::Utf8, | ||
| } | ||
| } | ||
|
|
||
| fn get_encoding() -> &'static MbEncoding { | ||
| static ENCODING: OnceLock<MbEncoding> = OnceLock::new(); | ||
| ENCODING.get_or_init(|| { | ||
| let val = ["LC_ALL", "LC_CTYPE", "LANG"] | ||
| .iter() | ||
| .find_map(|&k| std::env::var(k).ok().filter(|v| !v.is_empty())); | ||
| let s = match val.as_deref() { | ||
| Some(s) if s != "C" && s != "POSIX" => s, | ||
| _ => return MbEncoding::Utf8, | ||
| }; | ||
| if let Some(enc) = s.split('.').nth(1) { | ||
| let enc = enc.split('@').next().unwrap_or(enc); | ||
| encoding_from_name(&enc.to_ascii_lowercase()) | ||
| } else { | ||
| // Bare locale defaults from glibc localedata/SUPPORTED | ||
| match s.split('@').next().unwrap_or(s) { | ||
| "zh_CN" | "zh_SG" => MbEncoding::Gb18030, | ||
| "zh_TW" | "zh_HK" => MbEncoding::Big5, | ||
| _ => MbEncoding::Utf8, | ||
| } | ||
| } | ||
| }) | ||
| } | ||
|
|
||
| /// Byte length of the first character in `bytes` under the current locale encoding. | ||
| pub fn mb_char_len(bytes: &[u8]) -> usize { | ||
| debug_assert!(!bytes.is_empty()); | ||
| let b0 = bytes[0]; | ||
| if b0 <= 0x7F { | ||
| return 1; | ||
| } | ||
| match get_encoding() { | ||
| MbEncoding::Utf8 => utf8_len(bytes, b0), | ||
| MbEncoding::Gb18030 => gb18030_len(bytes, b0), | ||
| MbEncoding::EucJp => eucjp_len(bytes, b0), | ||
| MbEncoding::EucKr => euckr_len(bytes, b0), | ||
| MbEncoding::Big5 => big5_len(bytes, b0), | ||
| } | ||
| } | ||
|
|
||
| // All helpers below assume b0 > 0x7F (ASCII already handled by caller). | ||
|
|
||
| fn utf8_len(b: &[u8], b0: u8) -> usize { | ||
| let n = match b0 { | ||
| 0xC2..=0xDF => 2, | ||
| 0xE0..=0xEF => 3, | ||
| 0xF0..=0xF4 => 4, | ||
| _ => return 1, | ||
| }; | ||
| if b.len() >= n && b[1..n].iter().all(|&c| c & 0xC0 == 0x80) { | ||
| n | ||
| } else { | ||
| 1 | ||
| } | ||
| } | ||
|
|
||
| // 2-byte: [81-FE][40-7E,80-FE] 4-byte: [81-FE][30-39][81-FE][30-39] | ||
| fn gb18030_len(b: &[u8], b0: u8) -> usize { | ||
| if !(0x81..=0xFE).contains(&b0) { | ||
| return 1; | ||
| } | ||
| if b.len() >= 4 | ||
| && (0x30..=0x39).contains(&b[1]) | ||
| && (0x81..=0xFE).contains(&b[2]) | ||
| && (0x30..=0x39).contains(&b[3]) | ||
| { | ||
| return 4; | ||
| } | ||
| if b.len() >= 2 && ((0x40..=0x7E).contains(&b[1]) || (0x80..=0xFE).contains(&b[1])) { | ||
| return 2; | ||
| } | ||
| 1 | ||
| } | ||
|
|
||
| // 3-byte: [8F][A1-FE][A1-FE] 2-byte: [8E][A1-DF] or [A1-FE][A1-FE] | ||
| fn eucjp_len(b: &[u8], b0: u8) -> usize { | ||
| if b0 == 0x8F && b.len() >= 3 && (0xA1..=0xFE).contains(&b[1]) && (0xA1..=0xFE).contains(&b[2]) | ||
| { | ||
| return 3; | ||
| } | ||
| if b.len() >= 2 { | ||
| if b0 == 0x8E && (0xA1..=0xDF).contains(&b[1]) { | ||
| return 2; | ||
| } | ||
| if (0xA1..=0xFE).contains(&b0) && (0xA1..=0xFE).contains(&b[1]) { | ||
| return 2; | ||
| } | ||
| } | ||
| 1 | ||
| } | ||
|
|
||
| // 2-byte: [A1-FE][A1-FE] | ||
| fn euckr_len(b: &[u8], b0: u8) -> usize { | ||
| if (0xA1..=0xFE).contains(&b0) && b.len() >= 2 && (0xA1..=0xFE).contains(&b[1]) { | ||
| 2 | ||
| } else { | ||
| 1 | ||
| } | ||
| } | ||
|
|
||
| // 2-byte: [81-FE][40-7E,A1-FE] | ||
| fn big5_len(b: &[u8], b0: u8) -> usize { | ||
| if (0x81..=0xFE).contains(&b0) | ||
| && b.len() >= 2 | ||
| && ((0x40..=0x7E).contains(&b[1]) || (0xA1..=0xFE).contains(&b[1])) | ||
| { | ||
| 2 | ||
| } else { | ||
| 1 | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -162,6 +162,9 @@ fi | |
| grep -rl 'path_prepend_' tests/* | xargs -r "${SED}" -i 's| path_prepend_ ./src||' | ||
| # path_prepend_ sets $abs_path_dir_: set it manually instead. | ||
| grep -rl '\$abs_path_dir_' tests/*/*.sh | xargs -r "${SED}" -i "s|\$abs_path_dir_|${UU_BUILD_DIR//\//\\/}|g" | ||
| # Some tests use $abs_top_builddir/src for shebangs: point them to the uutils build dir. | ||
| grep -rl '\$abs_top_builddir/src' tests/*/*.sh tests/*/*.pl | xargs -r "${SED}" -i "s|\$abs_top_builddir/src|${UU_BUILD_DIR//\//\\/}|g" | ||
| grep -rl '\$ENV{abs_top_builddir}/src' tests/*/*.pl | xargs -r "${SED}" -i "s|\$ENV{abs_top_builddir}/src|${UU_BUILD_DIR//\//\\/}|g" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @pixelb Additional
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Whoops that is unrelated but it changes two skips to passes, was working on that locally and it got added
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to the logs I think we deleted the gnu coreutils binaries from that env so it means that it just skips because its unable to find a binary.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we simply symlink our bins to
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
| # We can't build runcon and chcon without libselinux. But GNU no longer builds dummies of them. So consider they are SELinux specific. | ||
| sed -i 's/^print_ver_.*/require_selinux_/' tests/runcon/runcon-compute.sh | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Potential panic if mb_char_len returns non-zero but bytes[i..] is shorter than returned length, no?