Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/uu/paste/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ path = "src/paste.rs"

[dependencies]
clap = { workspace = true }
uucore = { workspace = true }
uucore = { workspace = true, features = ["i18n-charmap"] }
fluent = { workspace = true }

[[bin]]
Expand Down
101 changes: 41 additions & 60 deletions src/uu/paste/src/paste.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use std::rc::Rc;
use std::slice::Iter;
use uucore::error::{UResult, USimpleError};
use uucore::format_usage;
use uucore::i18n::charmap::mb_char_len;
use uucore::line_ending::LineEnding;
use uucore::translate;

Expand All @@ -29,7 +30,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?;

let serial = matches.get_flag(options::SERIAL);
let delimiters = matches.get_one::<String>(options::DELIMITER).unwrap();
let delimiters = matches.get_one::<OsString>(options::DELIMITER).unwrap();
let files = matches
.get_many::<OsString>(options::FILE)
.unwrap()
Expand Down Expand Up @@ -61,7 +62,8 @@ pub fn uu_app() -> Command {
.help(translate!("paste-help-delimiter"))
.value_name("LIST")
.default_value("\t")
.hide_default_value(true),
.hide_default_value(true)
.value_parser(clap::value_parser!(OsString)),
)
.arg(
Arg::new(options::FILE)
Expand All @@ -84,7 +86,7 @@ pub fn uu_app() -> Command {
fn paste(
filenames: Vec<OsString>,
serial: bool,
delimiters: &str,
delimiters: &OsString,
line_ending: LineEnding,
) -> UResult<()> {
let unescaped_and_encoded_delimiters = parse_delimiters(delimiters)?;
Expand Down Expand Up @@ -185,65 +187,44 @@ fn paste(
Ok(())
}

fn parse_delimiters(delimiters: &str) -> UResult<Box<[Box<[u8]>]>> {
/// A single backslash char
const BACKSLASH: char = '\\';

fn add_one_byte_single_char_delimiter(vec: &mut Vec<Box<[u8]>>, byte: u8) {
vec.push(Box::new([byte]));
}

// a buffer of length four is large enough to encode any char
let mut buffer = [0; 4];

let mut add_single_char_delimiter = |vec: &mut Vec<Box<[u8]>>, ch: char| {
let delimiter_encoded = ch.encode_utf8(&mut buffer);

vec.push(Box::<[u8]>::from(delimiter_encoded.as_bytes()));
};

let mut vec = Vec::<Box<[u8]>>::with_capacity(delimiters.len());

let mut chars = delimiters.chars();

// Unescape all special characters
while let Some(char) = chars.next() {
match char {
BACKSLASH => match chars.next() {
// "Empty string (not a null character)"
// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
Some('0') => {
vec.push(Box::<[u8; 0]>::new([]));
}
// "\\" to "\" (U+005C)
Some(BACKSLASH) => {
add_one_byte_single_char_delimiter(&mut vec, b'\\');
}
// "\n" to U+000A
Some('n') => {
add_one_byte_single_char_delimiter(&mut vec, b'\n');
}
// "\t" to U+0009
Some('t') => {
add_one_byte_single_char_delimiter(&mut vec, b'\t');
}
Some(other_char) => {
// "If any other characters follow the <backslash>, the results are unspecified."
// https://pubs.opengroup.org/onlinepubs/9799919799/utilities/paste.html
// However, other implementations remove the backslash
// See "test_posix_unspecified_delimiter"
add_single_char_delimiter(&mut vec, other_char);
}
None => {
return Err(USimpleError::new(
1,
translate!("paste-error-delimiter-unescaped-backslash", "delimiters" => delimiters),
));
fn parse_delimiters(delimiters: &OsString) -> UResult<Box<[Box<[u8]>]>> {
let bytes = uucore::os_str_as_bytes(delimiters)?;
let mut vec = Vec::<Box<[u8]>>::with_capacity(bytes.len());
let mut i = 0;

while i < bytes.len() {
if bytes[i] == b'\\' {
i += 1;
if i >= bytes.len() {
return Err(USimpleError::new(
1,
translate!("paste-error-delimiter-unescaped-backslash", "delimiters" => delimiters.to_string_lossy()),
));
}
match bytes[i] {
b'0' => vec.push(Box::new([])),
b'\\' => vec.push(Box::new([b'\\'])),
b'n' => vec.push(Box::new([b'\n'])),
b't' => vec.push(Box::new([b'\t'])),
b'b' => vec.push(Box::new([b'\x08'])),
b'f' => vec.push(Box::new([b'\x0C'])),
b'r' => vec.push(Box::new([b'\r'])),
b'v' => vec.push(Box::new([b'\x0B'])),
_ => {
// Unknown escape: strip backslash, use the following character(s)
let remaining = &bytes[i..];
let len = mb_char_len(remaining).min(remaining.len());
vec.push(Box::from(&bytes[i..i + len]));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential panic if mb_char_len returns non-zero but bytes[i..] is shorter than returned length, no?

i += len;
continue;
}
},
non_backslash_char => {
add_single_char_delimiter(&mut vec, non_backslash_char);
}
i += 1;
} else {
let remaining = &bytes[i..];
let len = mb_char_len(remaining).min(remaining.len());
vec.push(Box::from(&bytes[i..i + len]));
i += len;
}
}

Expand Down
3 changes: 2 additions & 1 deletion src/uucore/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,8 @@ format = [
"quoting-style",
"unit-prefix",
]
i18n-all = ["i18n-collator", "i18n-decimal", "i18n-datetime"]
i18n-all = ["i18n-charmap", "i18n-collator", "i18n-decimal", "i18n-datetime"]
i18n-charmap = ["i18n-common"]
i18n-common = ["icu_locale"]
i18n-collator = ["i18n-common", "icu_collator"]
i18n-decimal = ["i18n-common", "icu_decimal", "icu_provider"]
Expand Down
140 changes: 140 additions & 0 deletions src/uucore/src/lib/features/i18n/charmap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.

// spell-checker:ignore langinfo charmap eucjp euckr euctw CTYPE HKSCS hkscs localedata

//! Locale-aware multi-byte character length detection via `LC_CTYPE`.

use std::sync::OnceLock;

/// Multi-byte encodings whose character boundaries this module can detect.
///
/// The variant is chosen from the locale environment by `get_encoding` and
/// selects which `*_len` helper `mb_char_len` dispatches to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MbEncoding {
    /// UTF-8; also the fallback for unset, `C`/`POSIX` and unrecognised locales.
    Utf8,
    /// GB18030 (the codeset names `gbk` and `gb2312` are mapped here as well).
    Gb18030,
    /// EUC-JP.
    EucJp,
    /// EUC-KR.
    EucKr,
    /// Big5 / Big5-HKSCS (the `euc-tw` codeset names are mapped here as well).
    Big5,
}

/// Maps a codeset name to the encoding used for character-length detection.
///
/// The comparison is exact, so `enc` is expected to be lower-cased already
/// (the caller in this file lower-cases it). Any name that is not listed
/// falls back to [`MbEncoding::Utf8`].
fn encoding_from_name(enc: &str) -> MbEncoding {
    if matches!(enc, "gb18030" | "gbk" | "gb2312") {
        return MbEncoding::Gb18030;
    }
    if matches!(enc, "euc-jp" | "eucjp") {
        return MbEncoding::EucJp;
    }
    if matches!(enc, "euc-kr" | "euckr") {
        return MbEncoding::EucKr;
    }
    // NOTE(review): EUC-TW is grouped with Big5 here although it is a distinct
    // encoding — confirm that the Big5 byte rules are acceptable for it.
    if matches!(
        enc,
        "big5" | "big5-hkscs" | "big5hkscs" | "euc-tw" | "euctw"
    ) {
        return MbEncoding::Big5;
    }
    MbEncoding::Utf8
}

/// Returns the encoding derived from the locale environment.
///
/// Detection runs on the first call only; the result is cached in a
/// `OnceLock` and reused for every later call.
fn get_encoding() -> &'static MbEncoding {
    static ENCODING: OnceLock<MbEncoding> = OnceLock::new();

    /// Inspects `LC_ALL`, `LC_CTYPE` and `LANG` (in that order) and picks an encoding.
    fn detect() -> MbEncoding {
        // The first variable that is set to a non-empty (and valid Unicode) value wins.
        let mut locale = None;
        for key in ["LC_ALL", "LC_CTYPE", "LANG"] {
            match std::env::var(key) {
                Ok(value) if !value.is_empty() => {
                    locale = Some(value);
                    break;
                }
                _ => {}
            }
        }

        let Some(locale) = locale else {
            return MbEncoding::Utf8;
        };
        if locale == "C" || locale == "POSIX" {
            return MbEncoding::Utf8;
        }

        // Locale names look like `language_TERRITORY.codeset@modifier`;
        // the codeset is the second dot-separated field.
        let mut fields = locale.split('.');
        fields.next();
        match fields.next() {
            Some(codeset) => {
                let codeset = codeset.split('@').next().unwrap_or(codeset);
                encoding_from_name(&codeset.to_ascii_lowercase())
            }
            None => {
                // Bare locale defaults from glibc localedata/SUPPORTED
                let bare = locale.split('@').next().unwrap_or(&locale);
                match bare {
                    "zh_CN" | "zh_SG" => MbEncoding::Gb18030,
                    "zh_TW" | "zh_HK" => MbEncoding::Big5,
                    _ => MbEncoding::Utf8,
                }
            }
        }
    }

    ENCODING.get_or_init(detect)
}

/// Byte length of the first character in `bytes` under the current locale encoding.
///
/// Returns `0` for an empty slice. Otherwise the result is at least `1` and
/// never exceeds `bytes.len()`: every encoding helper checks that the slice is
/// long enough before reporting a multi-byte length, and falls back to `1`
/// for truncated or invalid sequences. Slicing `bytes[..mb_char_len(bytes)]`
/// is therefore always in bounds.
///
/// ASCII bytes (`0x00..=0x7F`) are always reported as one byte, independent of
/// the locale encoding.
pub fn mb_char_len(bytes: &[u8]) -> usize {
    // An empty slice has no first character; report 0 instead of panicking on
    // an out-of-bounds index.
    let Some(&b0) = bytes.first() else {
        return 0;
    };
    if b0 <= 0x7F {
        return 1;
    }
    match get_encoding() {
        MbEncoding::Utf8 => utf8_len(bytes, b0),
        MbEncoding::Gb18030 => gb18030_len(bytes, b0),
        MbEncoding::EucJp => eucjp_len(bytes, b0),
        MbEncoding::EucKr => euckr_len(bytes, b0),
        MbEncoding::Big5 => big5_len(bytes, b0),
    }
}

// All helpers below assume b0 > 0x7F (ASCII already handled by caller).

/// Length in bytes of the UTF-8 sequence at the start of `b`, or `1` if the
/// sequence is truncated or ill-formed.
///
/// `b0` is the lead byte (the caller passes `b[0]`). Only well-formed
/// sequences are reported as multi-byte: overlong encodings, UTF-16 surrogates
/// (`ED A0..BF`) and code points above U+10FFFF are rejected, so their bytes
/// are treated as individual one-byte units like any other invalid input.
fn utf8_len(b: &[u8], b0: u8) -> usize {
    // For each lead byte: the sequence length and the inclusive range allowed
    // for the *second* byte. Any further bytes are plain continuation bytes.
    let (n, lo, hi): (usize, u8, u8) = match b0 {
        0xC2..=0xDF => (2, 0x80, 0xBF),
        0xE0 => (3, 0xA0, 0xBF), // below A0 would be an overlong 3-byte form
        0xE1..=0xEC | 0xEE..=0xEF => (3, 0x80, 0xBF),
        0xED => (3, 0x80, 0x9F), // A0..BF would encode a UTF-16 surrogate
        0xF0 => (4, 0x90, 0xBF), // below 90 would be an overlong 4-byte form
        0xF1..=0xF3 => (4, 0x80, 0xBF),
        0xF4 => (4, 0x80, 0x8F), // above 8F would exceed U+10FFFF
        _ => return 1,
    };
    // `get` yields `None` when fewer than `n` bytes are available.
    match b.get(1..n) {
        Some([second, rest @ ..])
            if (lo..=hi).contains(second) && rest.iter().all(|&c| c & 0xC0 == 0x80) =>
        {
            n
        }
        _ => 1,
    }
}

// GB18030 character length.
//   2-byte: [81-FE][40-7E,80-FE]
//   4-byte: [81-FE][30-39][81-FE][30-39]
// Anything else (bad lead byte, bad trail byte, truncated input) counts as 1.
fn gb18030_len(b: &[u8], b0: u8) -> usize {
    if !matches!(b0, 0x81..=0xFE) {
        return 1;
    }
    // The lead byte was validated through `b0`; only the trail bytes are matched here.
    match b {
        [_, 0x30..=0x39, 0x81..=0xFE, 0x30..=0x39, ..] => 4,
        [_, 0x40..=0x7E | 0x80..=0xFE, ..] => 2,
        _ => 1,
    }
}

// EUC-JP character length.
//   3-byte: [8F][A1-FE][A1-FE]
//   2-byte: [8E][A1-DF] or [A1-FE][A1-FE]
// Anything else (including truncated sequences) counts as 1.
fn eucjp_len(b: &[u8], b0: u8) -> usize {
    // The lead byte is taken from `b0`; the slice pattern only checks trail bytes.
    match (b0, b) {
        (0x8F, [_, 0xA1..=0xFE, 0xA1..=0xFE, ..]) => 3,
        (0x8E, [_, 0xA1..=0xDF, ..]) => 2,
        (0xA1..=0xFE, [_, 0xA1..=0xFE, ..]) => 2,
        _ => 1,
    }
}

// EUC-KR character length.
//   2-byte: [A1-FE][A1-FE]
// Anything else (including a lone lead byte) counts as 1.
fn euckr_len(b: &[u8], b0: u8) -> usize {
    let in_range = |byte: u8| (0xA1..=0xFE).contains(&byte);
    match b.get(1).copied() {
        Some(trail) if in_range(b0) && in_range(trail) => 2,
        _ => 1,
    }
}

// Big5 character length.
//   2-byte: [81-FE][40-7E,A1-FE]
// Anything else (including a lone lead byte) counts as 1.
fn big5_len(b: &[u8], b0: u8) -> usize {
    if !matches!(b0, 0x81..=0xFE) {
        return 1;
    }
    match b.get(1).copied() {
        Some(0x40..=0x7E | 0xA1..=0xFE) => 2,
        _ => 1,
    }
}
2 changes: 2 additions & 0 deletions src/uucore/src/lib/features/i18n/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ use std::sync::OnceLock;

use icu_locale::{Locale, locale};

#[cfg(feature = "i18n-charmap")]
pub mod charmap;
#[cfg(feature = "i18n-collator")]
pub mod collator;
#[cfg(feature = "i18n-datetime")]
Expand Down
52 changes: 52 additions & 0 deletions tests/by-util/test_paste.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,30 @@ const EXAMPLE_DATA: &[TestData] = &[
ins: &["1 \na \n", "2\t\nb\t\n"],
out: "1 |2\t\na |b\t\n",
},
TestData {
name: "utf8-2byte-delim",
args: &["-d", "\u{00A2}"],
ins: &["1\n2\n", "a\nb\n"],
out: "1\u{00A2}a\n2\u{00A2}b\n",
},
TestData {
name: "utf8-3byte-delim",
args: &["-d", "\u{20AC}"],
ins: &["1\n2\n", "a\nb\n"],
out: "1\u{20AC}a\n2\u{20AC}b\n",
},
TestData {
name: "utf8-4byte-delim",
args: &["-d", "\u{1F600}", "-s"],
ins: &["1\n2\n3\n"],
out: "1\u{1F600}2\u{1F600}3\n",
},
TestData {
name: "utf8-multi-delim-cycle",
args: &["-d", "\u{00A2}\u{20AC}"],
ins: &["a\nb\nc\n", "1\n2\n3\n", "x\ny\nz\n"],
out: "a\u{00A2}1\u{20AC}x\nb\u{00A2}2\u{20AC}y\nc\u{00A2}3\u{20AC}z\n",
},
];

#[test]
Expand Down Expand Up @@ -334,6 +358,19 @@ fn test_backslash_zero_delimiter() {
}
}

#[test]
fn test_gnu_escape_sequences() {
    // Each escape accepted by `-d`, paired with the single byte it must expand to.
    for (escape, delim) in [
        (r"\b", 0x08_u8),
        (r"\f", 0x0C),
        (r"\r", 0x0D),
        (r"\v", 0x0B),
    ] {
        // In serial mode the three input lines are joined by the delimiter byte.
        let expected = [b'1', delim, b'2', delim, b'3', b'\n'];
        new_ucmd!()
            .args(&["-s", "-d", escape])
            .pipe_in("1\n2\n3\n")
            .succeeds()
            .stdout_only_bytes(expected);
    }
}

// As of 2024-10-09, only bsdutils (https://github.com/dcantrell/bsdutils, derived from FreeBSD) and toybox handle
// multibyte delimiter characters in the way a user would likely expect. BusyBox and GNU Core Utilities do not.
#[test]
Expand Down Expand Up @@ -378,6 +415,21 @@ fn test_data() {
}
}

#[test]
#[cfg(target_os = "linux")]
fn test_non_utf8_delimiter() {
    // Two bytes that are not valid UTF-8; under the GB18030 locale set below
    // they are expected to be kept together as a single delimiter.
    const DELIM_BYTES: [u8; 2] = [0xA2, 0xE3];

    let (at, mut ucmd) = at_and_ucmd!();
    at.write("f1", "1\n2\n");
    at.write("f2", "a\nb\n");

    let delim = std::ffi::OsString::from_vec(DELIM_BYTES.to_vec());
    ucmd.env("LC_ALL", "zh_CN.gb18030")
        .arg("-d")
        .arg(&delim)
        .args(&["f1", "f2"])
        .succeeds()
        .stdout_only_bytes(b"1\xA2\xE3a\n2\xA2\xE3b\n");
}

#[test]
#[cfg(target_os = "linux")]
fn test_paste_non_utf8_paths() {
Expand Down
3 changes: 3 additions & 0 deletions util/build-gnu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ fi
grep -rl 'path_prepend_' tests/* | xargs -r "${SED}" -i 's| path_prepend_ ./src||'
# path_prepend_ sets $abs_path_dir_: set it manually instead.
grep -rl '\$abs_path_dir_' tests/*/*.sh | xargs -r "${SED}" -i "s|\$abs_path_dir_|${UU_BUILD_DIR//\//\\/}|g"
# Some tests use $abs_top_builddir/src for shebangs: point them to the uutils build dir.
grep -rl '\$abs_top_builddir/src' tests/*/*.sh tests/*/*.pl | xargs -r "${SED}" -i "s|\$abs_top_builddir/src|${UU_BUILD_DIR//\//\\/}|g"
grep -rl '\$ENV{abs_top_builddir}/src' tests/*/*.pl | xargs -r "${SED}" -i "s|\$ENV{abs_top_builddir}/src|${UU_BUILD_DIR//\//\\/}|g"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pixelb Additional abs_top_builddir

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoops, that is unrelated, but it changes two skips to passes; I was working on that locally and it got added.

Copy link
Contributor

@oech3 oech3 Feb 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But doesn't that mean we are not using the uutils binaries here if we don't apply the sed?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to the logs, I think we deleted the GNU coreutils binaries from that env, so it just skips because it's unable to find a binary.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we simply symlink our bins to abs_top_builddir for all tests at once?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


# We can't build runcon and chcon without libselinux. But GNU no longer builds dummies of them. So consider they are SELinux specific.
sed -i 's/^print_ver_.*/require_selinux_/' tests/runcon/runcon-compute.sh
Expand Down
Loading