From bce9dd1ed2cdcf01509b6fbf5302ffaa7714e935 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Mon, 5 Feb 2024 09:53:11 +0100 Subject: Upgrade shlex to 1.3.0 This project was upgraded with external_updater. Usage: tools/external_updater/updater.sh update external/rust/crates/shlex For more info, check https://cs.android.com/android/platform/superproject/+/main:tools/external_updater/README.md Test: TreeHugger Change-Id: Ibec8de1e338a506c887a28fd2629d887b0d3513e --- .cargo_vcs_info.json | 7 +- .gitignore | 9 +- Android.bp | 4 +- CHANGELOG.md | 4 + Cargo.toml | 27 ++- Cargo.toml.orig | 9 +- METADATA | 25 +-- README.md | 13 +- src/bytes.rs | 576 +++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 399 +++++++++++++++++++++------------- src/quoting_warning.md | 365 +++++++++++++++++++++++++++++++ 11 files changed, 1262 insertions(+), 176 deletions(-) create mode 100644 src/bytes.rs create mode 100644 src/quoting_warning.md diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json index 02eb82f..efa0c6e 100644 --- a/.cargo_vcs_info.json +++ b/.cargo_vcs_info.json @@ -1,5 +1,6 @@ { "git": { - "sha1": "8638f145d9356eed9c83e7b2f13c5209e72f0e27" - } -} + "sha1": "4a0724b0b62ef715467875b040a890ce75a8a829" + }, + "path_in_vcs": "" +} \ No newline at end of file diff --git a/.gitignore b/.gitignore index cd23ebe..d36a04d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ -/target/ -Cargo.lock +nocommit/ +target/ +artifacts/ +corpus/ +/Cargo.lock **/*.rs.bk +.*.sw? +.sw? 
diff --git a/Android.bp b/Android.bp index ace93db..3ddb79c 100644 --- a/Android.bp +++ b/Android.bp @@ -42,7 +42,7 @@ rust_library_host { name: "libshlex", crate_name: "shlex", cargo_env_compat: true, - cargo_pkg_version: "1.1.0", + cargo_pkg_version: "1.3.0", srcs: ["src/lib.rs"], edition: "2015", features: [ @@ -55,7 +55,7 @@ rust_test_host { name: "shlex_test_src_lib", crate_name: "shlex", cargo_env_compat: true, - cargo_pkg_version: "1.1.0", + cargo_pkg_version: "1.3.0", srcs: ["src/lib.rs"], test_suites: ["general-tests"], auto_gen_config: true, diff --git a/CHANGELOG.md b/CHANGELOG.md index 50d2e6e..95552b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# 1.2.0 + +* Adds `bytes` module to support operating directly on byte strings. + # 1.1.0 * Adds the `std` feature (enabled by default) diff --git a/Cargo.toml b/Cargo.toml index 2741ed8..2b66892 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,19 +3,30 @@ # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies -# to registry (e.g., crates.io) dependencies +# to registry (e.g., crates.io) dependencies. # -# If you believe there's an error in this file please file an -# issue against the rust-lang/cargo repository. If you're -# editing this file be aware that the upstream Cargo.toml -# will likely look very different (and much more reasonable) +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. [package] +rust-version = "1.46.0" name = "shlex" -version = "1.1.0" -authors = ["comex ", "Fenhl "] +version = "1.3.0" +authors = [ + "comex ", + "Fenhl ", + "Adrian Taylor ", + "Alex Touchet ", + "Daniel Parks ", + "Garrett Berg ", +] description = "Split a string into shell words, like Python's shlex." 
-categories = ["command-line-interface", "parser-implementations"] +readme = "README.md" +categories = [ + "command-line-interface", + "parser-implementations", +] license = "MIT OR Apache-2.0" repository = "https://github.com/comex/rust-shlex" diff --git a/Cargo.toml.orig b/Cargo.toml.orig index 57fb62b..c3644af 100644 --- a/Cargo.toml.orig +++ b/Cargo.toml.orig @@ -1,9 +1,13 @@ [package] name = "shlex" -version = "1.1.0" +version = "1.3.0" authors = [ "comex ", - "Fenhl " + "Fenhl ", + "Adrian Taylor ", + "Alex Touchet ", + "Daniel Parks ", + "Garrett Berg ", ] license = "MIT OR Apache-2.0" repository = "https://github.com/comex/rust-shlex" @@ -12,6 +16,7 @@ categories = [ "command-line-interface", "parser-implementations" ] +rust-version = "1.46.0" [features] std = [] diff --git a/METADATA b/METADATA index 9ee61c1..b7ec356 100644 --- a/METADATA +++ b/METADATA @@ -1,19 +1,20 @@ +# This project was upgraded with external_updater. +# Usage: tools/external_updater/updater.sh update external/rust/crates/shlex +# For more info, check https://cs.android.com/android/platform/superproject/+/main:tools/external_updater/README.md + name: "shlex" description: "Split a string into shell words, like Python\'s shlex." 
third_party { - url { - type: HOMEPAGE - value: "https://crates.io/crates/shlex" - } - url { - type: ARCHIVE - value: "https://static.crates.io/crates/shlex/shlex-1.1.0.crate" - } - version: "1.1.0" license_type: NOTICE last_upgrade_date { - year: 2021 - month: 9 - day: 22 + year: 2024 + month: 2 + day: 5 + } + homepage: "https://crates.io/crates/shlex" + identifier { + type: "Archive" + value: "https://static.crates.io/crates/shlex/shlex-1.3.0.crate" + version: "1.3.0" } } diff --git a/README.md b/README.md index 6778828..6400a6f 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,11 @@ +[![ci badge]][ci link] [![crates.io badge]][crates.io link] [![docs.rs badge]][docs.rs link] + +[crates.io badge]: https://img.shields.io/crates/v/shlex.svg?style=flat-square +[crates.io link]: https://crates.io/crates/shlex +[docs.rs badge]: https://img.shields.io/badge/docs-online-dddddd.svg?style=flat-square +[docs.rs link]: https://docs.rs/shlex +[ci badge]: https://img.shields.io/github/actions/workflow/status/comex/rust-shlex/test.yml?branch=master&style=flat-square +[ci link]: https://github.com/comex/rust-shlex/actions Same idea as (but implementation not directly based on) the Python shlex module. However, this implementation does not support any of the Python @@ -8,8 +16,9 @@ You only get the default settings of shlex.split, which mimic the POSIX shell: This implementation also deviates from the Python version in not treating \r specially, which I believe is more compliant. -The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate -over the bytes directly as a micro-optimization. +This crate can be used on either normal Rust strings, or on byte strings with +the `bytes` module. The algorithms used are oblivious to UTF-8 high bytes, so +internally they all work on bytes directly as a micro-optimization. 
Disabling the `std` feature (which is enabled by default) will allow the crate to work in `no_std` environments, where the `alloc` crate, and a global diff --git a/src/bytes.rs b/src/bytes.rs new file mode 100644 index 0000000..af8daad --- /dev/null +++ b/src/bytes.rs @@ -0,0 +1,576 @@ +// Copyright 2015 Nicholas Allegra (comex). +// Licensed under the Apache License, Version 2.0 or +// the MIT license , at your option. This file may not be +// copied, modified, or distributed except according to those terms. + +//! [`Shlex`] and friends for byte strings. +//! +//! This is used internally by the [outer module](crate), and may be more +//! convenient if you are working with byte slices (`[u8]`) or types that are +//! wrappers around bytes, such as [`OsStr`](std::ffi::OsStr): +//! +//! ```rust +//! #[cfg(unix)] { +//! use shlex::bytes::quote; +//! use std::ffi::OsStr; +//! use std::os::unix::ffi::OsStrExt; +//! +//! // `\x80` is invalid in UTF-8. +//! let os_str = OsStr::from_bytes(b"a\x80b c"); +//! assert_eq!(quote(os_str.as_bytes()), &b"'a\x80b c'"[..]); +//! } +//! ``` +//! +//! (On Windows, `OsStr` uses 16 bit wide characters so this will not work.) + +extern crate alloc; +use alloc::vec::Vec; +use alloc::borrow::Cow; +#[cfg(test)] +use alloc::vec; +#[cfg(test)] +use alloc::borrow::ToOwned; +#[cfg(all(doc, not(doctest)))] +use crate::{self as shlex, quoting_warning}; + +use super::QuoteError; + +/// An iterator that takes an input byte string and splits it into the words using the same syntax as +/// the POSIX shell. +pub struct Shlex<'a> { + in_iter: core::slice::Iter<'a, u8>, + /// The number of newlines read so far, plus one. + pub line_no: usize, + /// An input string is erroneous if it ends while inside a quotation or right after an + /// unescaped backslash. 
Since Iterator does not have a mechanism to return an error, if that + /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to + /// true; best to check it after you're done iterating. + pub had_error: bool, +} + +impl<'a> Shlex<'a> { + pub fn new(in_bytes: &'a [u8]) -> Self { + Shlex { + in_iter: in_bytes.iter(), + line_no: 1, + had_error: false, + } + } + + fn parse_word(&mut self, mut ch: u8) -> Option> { + let mut result: Vec = Vec::new(); + loop { + match ch as char { + '"' => if let Err(()) = self.parse_double(&mut result) { + self.had_error = true; + return None; + }, + '\'' => if let Err(()) = self.parse_single(&mut result) { + self.had_error = true; + return None; + }, + '\\' => if let Some(ch2) = self.next_char() { + if ch2 != '\n' as u8 { result.push(ch2); } + } else { + self.had_error = true; + return None; + }, + ' ' | '\t' | '\n' => { break; }, + _ => { result.push(ch as u8); }, + } + if let Some(ch2) = self.next_char() { ch = ch2; } else { break; } + } + Some(result) + } + + fn parse_double(&mut self, result: &mut Vec) -> Result<(), ()> { + loop { + if let Some(ch2) = self.next_char() { + match ch2 as char { + '\\' => { + if let Some(ch3) = self.next_char() { + match ch3 as char { + // \$ => $ + '$' | '`' | '"' | '\\' => { result.push(ch3); }, + // \ => nothing + '\n' => {}, + // \x => =x + _ => { result.push('\\' as u8); result.push(ch3); } + } + } else { + return Err(()); + } + }, + '"' => { return Ok(()); }, + _ => { result.push(ch2); }, + } + } else { + return Err(()); + } + } + } + + fn parse_single(&mut self, result: &mut Vec) -> Result<(), ()> { + loop { + if let Some(ch2) = self.next_char() { + match ch2 as char { + '\'' => { return Ok(()); }, + _ => { result.push(ch2); }, + } + } else { + return Err(()); + } + } + } + + fn next_char(&mut self) -> Option { + let res = self.in_iter.next().copied(); + if res == Some(b'\n') { self.line_no += 1; } + res + } +} + +impl<'a> Iterator for Shlex<'a> { + type Item 
= Vec; + fn next(&mut self) -> Option { + if let Some(mut ch) = self.next_char() { + // skip initial whitespace + loop { + match ch as char { + ' ' | '\t' | '\n' => {}, + '#' => { + while let Some(ch2) = self.next_char() { + if ch2 as char == '\n' { break; } + } + }, + _ => { break; } + } + if let Some(ch2) = self.next_char() { ch = ch2; } else { return None; } + } + self.parse_word(ch) + } else { // no initial character + None + } + } + +} + +/// Convenience function that consumes the whole byte string at once. Returns None if the input was +/// erroneous. +pub fn split(in_bytes: &[u8]) -> Option>> { + let mut shl = Shlex::new(in_bytes); + let res = shl.by_ref().collect(); + if shl.had_error { None } else { Some(res) } +} + +/// A more configurable interface to quote strings. If you only want the default settings you can +/// use the convenience functions [`try_quote`] and [`try_join`]. +/// +/// The string equivalent is [`shlex::Quoter`]. +#[derive(Default, Debug, Clone)] +pub struct Quoter { + allow_nul: bool, + // TODO: more options +} + +impl Quoter { + /// Create a new [`Quoter`] with default settings. + #[inline] + pub fn new() -> Self { + Self::default() + } + + /// Set whether to allow [nul bytes](quoting_warning#nul-bytes). By default they are not + /// allowed and will result in an error of [`QuoteError::Nul`]. + #[inline] + pub fn allow_nul(mut self, allow: bool) -> Self { + self.allow_nul = allow; + self + } + + /// Convenience function that consumes an iterable of words and turns it into a single byte string, + /// quoting words when necessary. Consecutive words will be separated by a single space. + pub fn join<'a, I: IntoIterator>(&self, words: I) -> Result, QuoteError> { + Ok(words.into_iter() + .map(|word| self.quote(word)) + .collect::>, QuoteError>>()? + .join(&b' ')) + } + + /// Given a single word, return a byte string suitable to encode it as a shell argument. + /// + /// If given valid UTF-8, this will never produce invalid UTF-8. 
This is because it only + /// ever inserts valid ASCII characters before or after existing ASCII characters (or + /// returns two single quotes if the input was an empty string). It will never modify a + /// multibyte UTF-8 character. + pub fn quote<'a>(&self, mut in_bytes: &'a [u8]) -> Result, QuoteError> { + if in_bytes.is_empty() { + // Empty string. Special case that isn't meaningful as only part of a word. + return Ok(b"''"[..].into()); + } + if !self.allow_nul && in_bytes.iter().any(|&b| b == b'\0') { + return Err(QuoteError::Nul); + } + let mut out: Vec = Vec::new(); + while !in_bytes.is_empty() { + // Pick a quoting strategy for some prefix of the input. Normally this will cover the + // entire input, but in some cases we might need to divide the input into multiple chunks + // that are quoted differently. + let (cur_len, strategy) = quoting_strategy(in_bytes); + if cur_len == in_bytes.len() && strategy == QuotingStrategy::Unquoted && out.is_empty() { + // Entire string can be represented unquoted. Reuse the allocation. + return Ok(in_bytes.into()); + } + let (cur_chunk, rest) = in_bytes.split_at(cur_len); + assert!(rest.len() < in_bytes.len()); // no infinite loop + in_bytes = rest; + append_quoted_chunk(&mut out, cur_chunk, strategy); + } + Ok(out.into()) + } + +} + +#[derive(PartialEq)] +enum QuotingStrategy { + /// No quotes and no backslash escapes. (If backslash escapes would be necessary, we use a + /// different strategy instead.) + Unquoted, + /// Single quoted. + SingleQuoted, + /// Double quotes, potentially with backslash escapes. + DoubleQuoted, + // TODO: add $'xxx' and "$(printf 'xxx')" styles +} + +/// Is this ASCII byte okay to emit unquoted? +const fn unquoted_ok(c: u8) -> bool { + match c as char { + // Allowed characters: + '+' | '-' | '.' 
| '/' | ':' | '@' | ']' | '_' | + '0'..='9' | 'A'..='Z' | 'a'..='z' + => true, + + // Non-allowed characters: + // From POSIX https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html + // "The application shall quote the following characters if they are to represent themselves:" + '|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | '\'' | ' ' | '\t' | '\n' | + // "and the following may need to be quoted under certain circumstances[..]:" + '*' | '?' | '[' | '#' | '~' | '=' | '%' | + // Brace expansion. These ought to be in the POSIX list but aren't yet; + // see: https://www.austingroupbugs.net/view.php?id=1193 + '{' | '}' | + // Also quote comma, just to be safe in the extremely odd case that the user of this crate + // is intentionally placing a quoted string inside a brace expansion, e.g.: + // format!("echo foo{{a,b,{}}}", shlex::quote(some_str)) + ',' | + // '\r' is allowed in a word by all real shells I tested, but is treated as a word + // separator by Python `shlex`, and might be translated to '\n' in interactive mode. + '\r' | + // '!' and '^' are treated specially in interactive mode; see quoting_warning. + '!' | '^' | + // Nul bytes and control characters. + '\x00' ..= '\x1f' | '\x7f' + => false, + '\u{80}' ..= '\u{10ffff}' => { + // This is unreachable since `unquoted_ok` is only called for 0..128. + // Non-ASCII bytes are handled separately in `quoting_strategy`. + // Can't call unreachable!() from `const fn` on old Rust, so... + unquoted_ok(c) + }, + } + // Note: The logic cited above for quoting comma might suggest that `..` should also be quoted + // (it being a special case of brace expansion). But it's not necessary. There are three cases: + // + // 1. The user wants comma-based brace expansion, but the untrusted string being `quote`d + // contains `..`, so they get something like `{foo,bar,3..5}`. + // => That's safe; both Bash and Zsh expand this to `foo bar 3..5` rather than + // `foo bar 3 4 5`. 
The presence of commas disables sequence expression expansion. + // + // 2. The user wants comma-based brace expansion where the contents of the braces are a + // variable number of `quote`d strings and nothing else. There happens to be exactly + // one string and it contains `..`, so they get something like `{3..5}`. + // => Then this will expand as a sequence expression, which is unintended. But I don't mind, + // because any such code is already buggy. Suppose the untrusted string *didn't* contain + // `,` or `..`, resulting in shell input like `{foo}`. Then the shell would interpret it + // as the literal string `{foo}` rather than brace-expanding it into `foo`. + // + // 3. The user wants a sequence expression and wants to supply an untrusted string as one of + // the endpoints or the increment. + // => Well, that's just silly, since the endpoints can only be numbers or single letters. +} + +/// Optimized version of `unquoted_ok`. +fn unquoted_ok_fast(c: u8) -> bool { + const UNQUOTED_OK_MASK: u128 = { + // Make a mask of all bytes in 0..<0x80 that pass. + let mut c = 0u8; + let mut mask = 0u128; + while c < 0x80 { + if unquoted_ok(c) { + mask |= 1u128 << c; + } + c += 1; + } + mask + }; + ((UNQUOTED_OK_MASK >> c) & 1) != 0 +} + +/// Is this ASCII byte okay to emit in single quotes? +fn single_quoted_ok(c: u8) -> bool { + match c { + // No single quotes in single quotes. + b'\'' => false, + // To work around a Bash bug, ^ is only allowed right after an opening single quote; see + // quoting_warning. + b'^' => false, + // Backslashes in single quotes are literal according to POSIX, but Fish treats them as an + // escape character. Ban them. Fish doesn't aim to be POSIX-compatible, but we *can* + // achieve Fish compatibility using double quotes, so we might as well. + b'\\' => false, + _ => true + } +} + +/// Is this ASCII byte okay to emit in double quotes? 
+fn double_quoted_ok(c: u8) -> bool { + match c { + // Work around Python `shlex` bug where parsing "\`" and "\$" doesn't strip the + // backslash, even though POSIX requires it. + b'`' | b'$' => false, + // '!' and '^' are treated specially in interactive mode; see quoting_warning. + b'!' | b'^' => false, + _ => true + } +} + +/// Given an input, return a quoting strategy that can cover some prefix of the string, along with +/// the size of that prefix. +/// +/// Precondition: input size is nonzero. (Empty strings are handled by the caller.) +/// Postcondition: returned size is nonzero. +#[cfg_attr(manual_codegen_check, inline(never))] +fn quoting_strategy(in_bytes: &[u8]) -> (usize, QuotingStrategy) { + const UNQUOTED_OK: u8 = 1; + const SINGLE_QUOTED_OK: u8 = 2; + const DOUBLE_QUOTED_OK: u8 = 4; + + let mut prev_ok = SINGLE_QUOTED_OK | DOUBLE_QUOTED_OK | UNQUOTED_OK; + let mut i = 0; + + if in_bytes[0] == b'^' { + // To work around a Bash bug, ^ is only allowed right after an opening single quote; see + // quoting_warning. + prev_ok = SINGLE_QUOTED_OK; + i = 1; + } + + while i < in_bytes.len() { + let c = in_bytes[i]; + let mut cur_ok = prev_ok; + + if c >= 0x80 { + // Normally, non-ASCII characters shouldn't require quoting, but see quoting_warning.md + // about \xa0. For now, just treat all non-ASCII characters as requiring quotes. This + // also ensures things are safe in the off-chance that you're in a legacy 8-bit locale that + // has additional characters satisfying `isblank`. + cur_ok &= !UNQUOTED_OK; + } else { + if !unquoted_ok_fast(c) { + cur_ok &= !UNQUOTED_OK; + } + if !single_quoted_ok(c){ + cur_ok &= !SINGLE_QUOTED_OK; + } + if !double_quoted_ok(c) { + cur_ok &= !DOUBLE_QUOTED_OK; + } + } + + if cur_ok == 0 { + // There are no quoting strategies that would work for both the previous characters and + // this one. So we have to end the chunk before this character. The caller will call + // `quoting_strategy` again to handle the rest of the string. 
+ break; + } + + prev_ok = cur_ok; + i += 1; + } + + // Pick the best allowed strategy. + let strategy = if prev_ok & UNQUOTED_OK != 0 { + QuotingStrategy::Unquoted + } else if prev_ok & SINGLE_QUOTED_OK != 0 { + QuotingStrategy::SingleQuoted + } else if prev_ok & DOUBLE_QUOTED_OK != 0 { + QuotingStrategy::DoubleQuoted + } else { + unreachable!() + }; + debug_assert!(i > 0); + (i, strategy) +} + +fn append_quoted_chunk(out: &mut Vec, cur_chunk: &[u8], strategy: QuotingStrategy) { + match strategy { + QuotingStrategy::Unquoted => { + out.extend_from_slice(cur_chunk); + }, + QuotingStrategy::SingleQuoted => { + out.reserve(cur_chunk.len() + 2); + out.push(b'\''); + out.extend_from_slice(cur_chunk); + out.push(b'\''); + }, + QuotingStrategy::DoubleQuoted => { + out.reserve(cur_chunk.len() + 2); + out.push(b'"'); + for &c in cur_chunk.into_iter() { + if let b'$' | b'`' | b'"' | b'\\' = c { + // Add a preceding backslash. + // Note: We shouldn't actually get here for $ and ` because they don't pass + // `double_quoted_ok`. + out.push(b'\\'); + } + // Add the character itself. + out.push(c); + } + out.push(b'"'); + }, + } +} + +/// Convenience function that consumes an iterable of words and turns it into a single byte string, +/// quoting words when necessary. Consecutive words will be separated by a single space. +/// +/// Uses default settings except that nul bytes are passed through, which [may be +/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated. +/// +/// Equivalent to [`Quoter::new().allow_nul(true).join(words).unwrap()`](Quoter). +/// +/// (That configuration never returns `Err`, so this function does not panic.) +/// +/// The string equivalent is [shlex::join]. 
+#[deprecated(since = "1.3.0", note = "replace with `try_join(words)?` to avoid nul byte danger")] +pub fn join<'a, I: IntoIterator>(words: I) -> Vec { + Quoter::new().allow_nul(true).join(words).unwrap() +} + +/// Convenience function that consumes an iterable of words and turns it into a single byte string, +/// quoting words when necessary. Consecutive words will be separated by a single space. +/// +/// Uses default settings. The only error that can be returned is [`QuoteError::Nul`]. +/// +/// Equivalent to [`Quoter::new().join(words)`](Quoter). +/// +/// The string equivalent is [shlex::try_join]. +pub fn try_join<'a, I: IntoIterator>(words: I) -> Result, QuoteError> { + Quoter::new().join(words) +} + +/// Given a single word, return a byte string suitable to encode it as a shell argument. +/// +/// Uses default settings except that nul bytes are passed through, which [may be +/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated. +/// +/// Equivalent to [`Quoter::new().allow_nul(true).quote(in_bytes).unwrap()`](Quoter). +/// +/// (That configuration never returns `Err`, so this function does not panic.) +/// +/// The string equivalent is [shlex::quote]. +#[deprecated(since = "1.3.0", note = "replace with `try_quote(str)?` to avoid nul byte danger")] +pub fn quote(in_bytes: &[u8]) -> Cow<[u8]> { + Quoter::new().allow_nul(true).quote(in_bytes).unwrap() +} + +/// Given a single word, return a byte string suitable to encode it as a shell argument. +/// +/// Uses default settings. The only error that can be returned is [`QuoteError::Nul`]. +/// +/// Equivalent to [`Quoter::new().quote(in_bytes)`](Quoter). +/// +/// (Unlike [`quote`], nul bytes are not passed through; they result in `Err`.) +/// +/// The string equivalent is [shlex::try_quote]. 
+pub fn try_quote(in_bytes: &[u8]) -> Result, QuoteError> { + Quoter::new().quote(in_bytes) +} + +#[cfg(test)] +const INVALID_UTF8: &[u8] = b"\xa1"; +#[cfg(test)] +const INVALID_UTF8_SINGLEQUOTED: &[u8] = b"'\xa1'"; + +#[test] +#[allow(invalid_from_utf8)] +fn test_invalid_utf8() { + // Check that our test string is actually invalid UTF-8. + assert!(core::str::from_utf8(INVALID_UTF8).is_err()); +} + +#[cfg(test)] +static SPLIT_TEST_ITEMS: &'static [(&'static [u8], Option<&'static [&'static [u8]]>)] = &[ + (b"foo$baz", Some(&[b"foo$baz"])), + (b"foo baz", Some(&[b"foo", b"baz"])), + (b"foo\"bar\"baz", Some(&[b"foobarbaz"])), + (b"foo \"bar\"baz", Some(&[b"foo", b"barbaz"])), + (b" foo \nbar", Some(&[b"foo", b"bar"])), + (b"foo\\\nbar", Some(&[b"foobar"])), + (b"\"foo\\\nbar\"", Some(&[b"foobar"])), + (b"'baz\\$b'", Some(&[b"baz\\$b"])), + (b"'baz\\\''", None), + (b"\\", None), + (b"\"\\", None), + (b"'\\", None), + (b"\"", None), + (b"'", None), + (b"foo #bar\nbaz", Some(&[b"foo", b"baz"])), + (b"foo #bar", Some(&[b"foo"])), + (b"foo#bar", Some(&[b"foo#bar"])), + (b"foo\"#bar", None), + (b"'\\n'", Some(&[b"\\n"])), + (b"'\\\\n'", Some(&[b"\\\\n"])), + (INVALID_UTF8, Some(&[INVALID_UTF8])), +]; + +#[test] +fn test_split() { + for &(input, output) in SPLIT_TEST_ITEMS { + assert_eq!(split(input), output.map(|o| o.iter().map(|&x| x.to_owned()).collect())); + } +} + +#[test] +fn test_lineno() { + let mut sh = Shlex::new(b"\nfoo\nbar"); + while let Some(word) = sh.next() { + if word == b"bar" { + assert_eq!(sh.line_no, 3); + } + } +} + +#[test] +#[allow(deprecated)] +fn test_quote() { + // Validate behavior with invalid UTF-8: + assert_eq!(quote(INVALID_UTF8), INVALID_UTF8_SINGLEQUOTED); + // Replicate a few tests from lib.rs. No need to replicate all of them. 
+ assert_eq!(quote(b""), &b"''"[..]); + assert_eq!(quote(b"foobar"), &b"foobar"[..]); + assert_eq!(quote(b"foo bar"), &b"'foo bar'"[..]); + assert_eq!(quote(b"'\""), &b"\"'\\\"\""[..]); + assert_eq!(quote(b""), &b"''"[..]); +} + +#[test] +#[allow(deprecated)] +fn test_join() { + // Validate behavior with invalid UTF-8: + assert_eq!(join(vec![INVALID_UTF8]), INVALID_UTF8_SINGLEQUOTED); + // Replicate a few tests from lib.rs. No need to replicate all of them. + assert_eq!(join(vec![]), &b""[..]); + assert_eq!(join(vec![&b""[..]]), b"''"); +} diff --git a/src/lib.rs b/src/lib.rs index 31b54bd..aa5c306 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,20 +3,37 @@ // the MIT license , at your option. This file may not be // copied, modified, or distributed except according to those terms. -//! Same idea as (but implementation not directly based on) the Python shlex module. However, this -//! implementation does not support any of the Python module's customization because it makes -//! parsing slower and is fairly useless. You only get the default settings of shlex.split, which -//! mimic the POSIX shell: -//! +//! Parse strings like, and escape strings for, POSIX shells. //! -//! This implementation also deviates from the Python version in not treating `\r` specially, which -//! I believe is more compliant. -//! -//! The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate over the bytes -//! directly as a micro-optimization. +//! Same idea as (but implementation not directly based on) the Python shlex module. //! //! Disabling the `std` feature (which is enabled by default) will allow the crate to work in //! `no_std` environments, where the `alloc` crate, and a global allocator, are available. +//! +//! ## Warning +//! +//! The [`try_quote`]/[`try_join`] family of APIs does not quote control characters (because they +//! cannot be quoted portably). +//! +//! 
This is fully safe in noninteractive contexts, like shell scripts and `sh -c` arguments (or +//! even scripts `source`d from interactive shells). +//! +//! But if you are quoting for human consumption, you should keep in mind that ugly inputs produce +//! ugly outputs (which may not be copy-pastable). +//! +//! And if by chance you are piping the output of [`try_quote`]/[`try_join`] directly to the stdin +//! of an interactive shell, you should stop, because control characters can lead to arbitrary +//! command injection. +//! +//! For more information, and for information about more minor issues, please see [quoting_warning]. +//! +//! ## Compatibility +//! +//! This crate's quoting functionality tries to be compatible with **any POSIX-compatible shell**; +//! it's tested against `bash`, `zsh`, `dash`, Busybox `ash`, and `mksh`, plus `fish` (which is not +//! POSIX-compatible but close enough). +//! +//! It also aims to be compatible with Python `shlex` and C `wordexp`. #![cfg_attr(not(feature = "std"), no_std)] @@ -29,124 +46,45 @@ use alloc::vec; #[cfg(test)] use alloc::borrow::ToOwned; +pub mod bytes; +#[cfg(all(doc, not(doctest)))] +#[path = "quoting_warning.md"] +pub mod quoting_warning; + /// An iterator that takes an input string and splits it into the words using the same syntax as /// the POSIX shell. -pub struct Shlex<'a> { - in_iter: core::str::Bytes<'a>, - /// The number of newlines read so far, plus one. - pub line_no: usize, - /// An input string is erroneous if it ends while inside a quotation or right after an - /// unescaped backslash. Since Iterator does not have a mechanism to return an error, if that - /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to - /// true; best to check it after you're done iterating. - pub had_error: bool, -} +/// +/// See [`bytes::Shlex`]. 
+pub struct Shlex<'a>(bytes::Shlex<'a>); impl<'a> Shlex<'a> { pub fn new(in_str: &'a str) -> Self { - Shlex { - in_iter: in_str.bytes(), - line_no: 1, - had_error: false, - } - } - - fn parse_word(&mut self, mut ch: u8) -> Option { - let mut result: Vec = Vec::new(); - loop { - match ch as char { - '"' => if let Err(()) = self.parse_double(&mut result) { - self.had_error = true; - return None; - }, - '\'' => if let Err(()) = self.parse_single(&mut result) { - self.had_error = true; - return None; - }, - '\\' => if let Some(ch2) = self.next_char() { - if ch2 != '\n' as u8 { result.push(ch2); } - } else { - self.had_error = true; - return None; - }, - ' ' | '\t' | '\n' => { break; }, - _ => { result.push(ch as u8); }, - } - if let Some(ch2) = self.next_char() { ch = ch2; } else { break; } - } - unsafe { Some(String::from_utf8_unchecked(result)) } + Self(bytes::Shlex::new(in_str.as_bytes())) } +} - fn parse_double(&mut self, result: &mut Vec) -> Result<(), ()> { - loop { - if let Some(ch2) = self.next_char() { - match ch2 as char { - '\\' => { - if let Some(ch3) = self.next_char() { - match ch3 as char { - // \$ => $ - '$' | '`' | '"' | '\\' => { result.push(ch3); }, - // \ => nothing - '\n' => {}, - // \x => =x - _ => { result.push('\\' as u8); result.push(ch3); } - } - } else { - return Err(()); - } - }, - '"' => { return Ok(()); }, - _ => { result.push(ch2); }, - } - } else { - return Err(()); - } - } +impl<'a> Iterator for Shlex<'a> { + type Item = String; + fn next(&mut self) -> Option { + self.0.next().map(|byte_word| { + // Safety: given valid UTF-8, bytes::Shlex will always return valid UTF-8. 
+ unsafe { String::from_utf8_unchecked(byte_word) } + }) } +} - fn parse_single(&mut self, result: &mut Vec) -> Result<(), ()> { - loop { - if let Some(ch2) = self.next_char() { - match ch2 as char { - '\'' => { return Ok(()); }, - _ => { result.push(ch2); }, - } - } else { - return Err(()); - } - } - } +impl<'a> core::ops::Deref for Shlex<'a> { + type Target = bytes::Shlex<'a>; - fn next_char(&mut self) -> Option { - let res = self.in_iter.next(); - if res == Some('\n' as u8) { self.line_no += 1; } - res + fn deref(&self) -> &Self::Target { + &self.0 } } -impl<'a> Iterator for Shlex<'a> { - type Item = String; - fn next(&mut self) -> Option { - if let Some(mut ch) = self.next_char() { - // skip initial whitespace - loop { - match ch as char { - ' ' | '\t' | '\n' => {}, - '#' => { - while let Some(ch2) = self.next_char() { - if ch2 as char == '\n' { break; } - } - }, - _ => { break; } - } - if let Some(ch2) = self.next_char() { ch = ch2; } else { return None; } - } - self.parse_word(ch) - } else { // no initial character - None - } +impl<'a> core::ops::DerefMut for Shlex<'a> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 } - } /// Convenience function that consumes the whole string at once. Returns None if the input was @@ -157,38 +95,151 @@ pub fn split(in_str: &str) -> Option> { if shl.had_error { None } else { Some(res) } } -/// Given a single word, return a string suitable to encode it as a shell argument. -pub fn quote(in_str: &str) -> Cow { - if in_str.len() == 0 { - "\"\"".into() - } else if in_str.bytes().any(|c| match c as char { - '|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | '\'' | ' ' | '\t' | - '\r' | '\n' | '*' | '?' | '[' | '#' | '~' | '=' | '%' => true, - _ => false - }) { - let mut out: Vec = Vec::new(); - out.push('"' as u8); - for c in in_str.bytes() { - match c as char { - '$' | '`' | '"' | '\\' => out.push('\\' as u8), - _ => () - } - out.push(c); +/// Errors from [`Quoter::quote`], [`Quoter::join`], etc. 
(and their [`bytes`] counterparts). +/// +/// By default, the only error that can be returned is [`QuoteError::Nul`]. If you call +/// `allow_nul(true)`, then no errors can be returned at all. Any error variants added in the +/// future will not be enabled by default; they will be enabled through corresponding non-default +/// [`Quoter`] options. +/// +/// ...In theory. In the unlikely event that additional classes of inputs are discovered that, +/// like nul bytes, are fundamentally unsafe to quote even for non-interactive shells, the risk +/// will be mitigated by adding corresponding [`QuoteError`] variants that *are* enabled by +/// default. +#[non_exhaustive] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum QuoteError { + /// The input contained a nul byte. In most cases, shells fundamentally [cannot handle strings + /// containing nul bytes](quoting_warning#nul-bytes), no matter how they are quoted. But if + /// you're sure you can handle nul bytes, you can call `allow_nul(true)` on the `Quoter` to let + /// them pass through. + Nul, +} + +impl core::fmt::Display for QuoteError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self { + QuoteError::Nul => f.write_str("cannot shell-quote string containing nul byte"), } - out.push('"' as u8); - unsafe { String::from_utf8_unchecked(out) }.into() - } else { - in_str.into() + } +} + +#[cfg(feature = "std")] +impl std::error::Error for QuoteError {} + +/// A more configurable interface to quote strings. If you only want the default settings you can +/// use the convenience functions [`try_quote`] and [`try_join`]. +/// +/// The bytes equivalent is [`bytes::Quoter`]. +#[derive(Default, Debug, Clone)] +pub struct Quoter { + inner: bytes::Quoter, +} + +impl Quoter { + /// Create a new [`Quoter`] with default settings. + #[inline] + pub fn new() -> Self { + Self::default() + } + + /// Set whether to allow [nul bytes](quoting_warning#nul-bytes). 
By default they are not + /// allowed and will result in an error of [`QuoteError::Nul`]. + #[inline] + pub fn allow_nul(mut self, allow: bool) -> Self { + self.inner = self.inner.allow_nul(allow); + self + } + + /// Convenience function that consumes an iterable of words and turns it into a single string, + /// quoting words when necessary. Consecutive words will be separated by a single space. + pub fn join<'a, I: IntoIterator>(&self, words: I) -> Result { + // Safety: given valid UTF-8, bytes::join() will always return valid UTF-8. + self.inner.join(words.into_iter().map(|s| s.as_bytes())) + .map(|bytes| unsafe { String::from_utf8_unchecked(bytes) }) + } + + /// Given a single word, return a string suitable to encode it as a shell argument. + pub fn quote<'a>(&self, in_str: &'a str) -> Result, QuoteError> { + Ok(match self.inner.quote(in_str.as_bytes())? { + Cow::Borrowed(out) => { + // Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8. + unsafe { core::str::from_utf8_unchecked(out) }.into() + } + Cow::Owned(out) => { + // Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8. + unsafe { String::from_utf8_unchecked(out) }.into() + } + }) + } +} + +impl From for Quoter { + fn from(inner: bytes::Quoter) -> Quoter { + Quoter { inner } + } +} + +impl From for bytes::Quoter { + fn from(quoter: Quoter) -> bytes::Quoter { + quoter.inner } } /// Convenience function that consumes an iterable of words and turns it into a single string, /// quoting words when necessary. Consecutive words will be separated by a single space. +/// +/// Uses default settings except that nul bytes are passed through, which [may be +/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated. +/// +/// Equivalent to [`Quoter::new().allow_nul(true).join(words).unwrap()`](Quoter). +/// +/// (That configuration never returns `Err`, so this function does not panic.) +/// +/// The bytes equivalent is [bytes::join]. 
+#[deprecated(since = "1.3.0", note = "replace with `try_join(words)?` to avoid nul byte danger")] pub fn join<'a, I: IntoIterator>(words: I) -> String { - words.into_iter() - .map(quote) - .collect::>() - .join(" ") + Quoter::new().allow_nul(true).join(words).unwrap() +} + +/// Convenience function that consumes an iterable of words and turns it into a single string, +/// quoting words when necessary. Consecutive words will be separated by a single space. +/// +/// Uses default settings. The only error that can be returned is [`QuoteError::Nul`]. +/// +/// Equivalent to [`Quoter::new().join(words)`](Quoter). +/// +/// The bytes equivalent is [bytes::try_join]. +pub fn try_join<'a, I: IntoIterator>(words: I) -> Result { + Quoter::new().join(words) +} + +/// Given a single word, return a string suitable to encode it as a shell argument. +/// +/// Uses default settings except that nul bytes are passed through, which [may be +/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated. +/// +/// Equivalent to [`Quoter::new().allow_nul(true).quote(in_str).unwrap()`](Quoter). +/// +/// (That configuration never returns `Err`, so this function does not panic.) +/// +/// The bytes equivalent is [bytes::quote]. +#[deprecated(since = "1.3.0", note = "replace with `try_quote(str)?` to avoid nul byte danger")] +pub fn quote(in_str: &str) -> Cow { + Quoter::new().allow_nul(true).quote(in_str).unwrap() +} + +/// Given a single word, return a string suitable to encode it as a shell argument. +/// +/// Uses default settings. The only error that can be returned is [`QuoteError::Nul`]. +/// +/// Equivalent to [`Quoter::new().quote(in_str)`](Quoter). +/// +/// (This function never panics; a nul byte in the input is reported as `Err`.) +/// +/// The bytes equivalent is [bytes::try_quote]. 
+pub fn try_quote(in_str: &str) -> Result, QuoteError> { + Quoter::new().quote(in_str) } #[cfg(test)] @@ -233,17 +284,75 @@ fn test_lineno() { } #[test] +#[cfg_attr(not(feature = "std"), allow(unreachable_code, unused_mut))] fn test_quote() { - assert_eq!(quote("foobar"), "foobar"); - assert_eq!(quote("foo bar"), "\"foo bar\""); - assert_eq!(quote("\""), "\"\\\"\""); - assert_eq!(quote(""), "\"\""); + // This is a list of (unquoted, quoted) pairs. + // But it's using a single long (raw) string literal with an ad-hoc format, just because it's + // hard to read if we have to put the test strings through Rust escaping on top of the escaping + // being tested. (Even raw string literals are noisy for short strings). + // Ad-hoc: "NL" is replaced with a literal newline; no other escape sequences. + let tests = r#" + <> => <''> + => + => <'foo bar'> + <"foo bar'"> => <"\"foo bar'\""> + <'foo bar'> => <"'foo bar'"> + <"> => <'"'> + <"'> => <"\"'"> + => <'hello!world'> + <'hello!world> => <"'hello"'!world'> + <'hello!> => <"'hello"'!'> + => <'hello ''^ world'> + => + => <'!world'"'"> + <{a, b}> => <'{a, b}'> + => <'NL'> + <^> => <'^'> + => + => <'NLx''^'> + => <'NL''^x'> + => <'NL ''^x'> + <{a,b}> => <'{a,b}'> + => <'a,b'> + + <'$> => <"'"'$'> + <"^> => <'"''^'> + "#; + let mut ok = true; + for test in tests.trim().split('\n') { + let parts: Vec = test + .replace("NL", "\n") + .split("=>") + .map(|part| part.trim().trim_start_matches('<').trim_end_matches('>').to_owned()) + .collect(); + assert!(parts.len() == 2); + let unquoted = &*parts[0]; + let quoted_expected = &*parts[1]; + let quoted_actual = try_quote(&parts[0]).unwrap(); + if quoted_expected != quoted_actual { + #[cfg(not(feature = "std"))] + panic!("FAIL: for input <{}>, expected <{}>, got <{}>", + unquoted, quoted_expected, quoted_actual); + #[cfg(feature = "std")] + println!("FAIL: for input <{}>, expected <{}>, got <{}>", + unquoted, quoted_expected, quoted_actual); + ok = false; + } + } + assert!(ok); } 
#[test] +#[allow(deprecated)] fn test_join() { assert_eq!(join(vec![]), ""); - assert_eq!(join(vec![""]), "\"\""); + assert_eq!(join(vec![""]), "''"); assert_eq!(join(vec!["a", "b"]), "a b"); - assert_eq!(join(vec!["foo bar", "baz"]), "\"foo bar\" baz"); + assert_eq!(join(vec!["foo bar", "baz"]), "'foo bar' baz"); +} + +#[test] +fn test_fallible() { + assert_eq!(try_join(vec!["\0"]), Err(QuoteError::Nul)); + assert_eq!(try_quote("\0"), Err(QuoteError::Nul)); } diff --git a/src/quoting_warning.md b/src/quoting_warning.md new file mode 100644 index 0000000..fab9857 --- /dev/null +++ b/src/quoting_warning.md @@ -0,0 +1,365 @@ +// vim: textwidth=99 +/* +Meta note: This file is loaded as a .rs file by rustdoc only. +*/ +/*! + +A more detailed version of the [warning at the top level](super#warning) about the `quote`/`join` +family of APIs. + +In general, passing the output of these APIs to a shell should recover the original string(s). +This page lists cases where it fails to do so. + +In noninteractive contexts, there are only minor issues. 'Noninteractive' includes shell scripts +and `sh -c` arguments, or even scripts `source`d from interactive shells. The issues are: + +- [Nul bytes](#nul-bytes) + +- [Overlong commands](#overlong-commands) + +If you are writing directly to the stdin of an interactive (`-i`) shell (i.e., if you are +pretending to be a terminal), or if you are writing to a cooked-mode pty (even if the other end is +noninteractive), then there is a **severe** security issue: + +- [Control characters](#control-characters-interactive-contexts-only) + +Finally, there are some [solved issues](#solved-issues). + +# List of issues + +## Nul bytes + +For non-interactive shells, the most problematic input is nul bytes (bytes with value 0). The +non-deprecated functions all default to returning [`QuoteError::Nul`] when encountering them, but +the deprecated [`quote`] and [`join`] functions leave them as-is. 
+ +In Unix, nul bytes can't appear in command arguments, environment variables, or filenames. It's +not a question of proper quoting; they just can't be used at all. This is a consequence of Unix's +system calls all being designed around nul-terminated C strings. + +Shells inherit that limitation. Most of them do not accept nul bytes in strings even internally. +Even when they do, it's pretty much useless or even dangerous, since you can't pass them to +external commands. + +In some cases, you might fail to pass the nul byte to the shell in the first place. For example, +the following code uses [`join`] to tunnel a command over an SSH connection: + +```rust +std::process::Command::new("ssh") + .arg("myhost") + .arg("--") + .arg(join(my_cmd_args)) +``` + +If any argument in `my_cmd_args` contains a nul byte, then `join(my_cmd_args)` will contain a nul +byte. But `join(my_cmd_args)` is itself being passed as an argument to a command (the ssh +command), and command arguments can't contain nul bytes! So this will simply result in the +`Command` failing to launch. + +Still, there are other ways to smuggle nul bytes into a shell. How the shell reacts depends on the +shell and the method of smuggling. For example, here is Bash 5.2.21 exhibiting three different +behaviors: + +- With ANSI-C quoting, the string is truncated at the first nul byte: + ```bash + $ echo $'foo\0bar' | hexdump -C + 00000000 66 6f 6f 0a |foo.| + ``` + +- With command substitution, nul bytes are removed with a warning: + ```bash + $ echo $(printf 'foo\0bar') | hexdump -C + bash: warning: command substitution: ignored null byte in input + 00000000 66 6f 6f 62 61 72 0a |foobar.| + ``` + +- When a nul byte appears directly in a shell script, it's removed with no warning: + ```bash + $ printf 'echo "foo\0bar"' | bash | hexdump -C + 00000000 66 6f 6f 62 61 72 0a |foobar.| + ``` + +Zsh, in contrast, actually allows nul bytes internally, in shell variables and even arguments to +builtin commands. 
But if a variable is exported to the environment, or if an argument is used for +an external command, then the child process will see it silently truncated at the first nul. This +might actually be more dangerous, depending on the use case. + +## Overlong commands + +If you pass a long string into a shell, several things might happen: + +- It might succeed, yet the shell might have trouble actually doing anything with it. For example: + + ```bash + x=$(printf '%010000000d' 0); /bin/echo $x + bash: /bin/echo: Argument list too long + ``` + +- If you're using certain shells (e.g. Busybox Ash) *and* using a pty for communication, then the + shell will impose a line length limit, ignoring all input past the limit. + +- If you're using a pty in cooked mode, then by default, if you write so many bytes as input that + it fills the kernel's internal buffer, the kernel will simply drop those bytes, instead of + blocking waiting for the shell to empty out the buffer. In other words, random bits of input can + be lost, which is obviously insecure. + +Future versions of this crate may add an option to [`Quoter`] to check the length for you. + +## Control characters (*interactive contexts only*) + +Control characters are the bytes from `\x00` to `\x1f`, plus `\x7f`. `\x00` (the nul byte) is +discussed [above](#nul-bytes), but what about the rest? Well, many of them correspond to terminal +keyboard shortcuts. For example, when you press Ctrl-A at a shell prompt, your terminal sends the +byte `\x01`. The shell sees that byte and (if not configured differently) takes the standard +action for Ctrl-A, which is to move the cursor to the beginning of the line. + +This means that it's quite dangerous to pipe bytes to an interactive shell. 
For example, here is a +program that tries to tell Bash to echo an arbitrary string, 'safely': +```rust +use std::process::{Command, Stdio}; +use std::io::Write; + +let evil_string = "\x01do_something_evil; "; +let quoted = shlex::try_quote(evil_string).unwrap(); +println!("quoted string is {:?}", quoted); + +let mut bash = Command::new("bash") + .arg("-i") // force interactive mode + .stdin(Stdio::piped()) + .spawn() + .unwrap(); +let stdin = bash.stdin.as_mut().unwrap(); +write!(stdin, "echo {}\n", quoted).unwrap(); +``` + +Here's the output of the program (with irrelevant bits removed): + +```text +quoted string is "'\u{1}do_something_evil; '" +/tmp comex$ do_something_evil; 'echo ' +bash: do_something_evil: command not found +bash: echo : command not found +``` + +Even though we quoted it, Bash still ran an arbitrary command! + +This is not because the quoting was insufficient, per se. In single quotes, all input is supposed +to be treated as raw data until the closing single quote. And in fact, this would work fine +without the `"-i"` argument. + +But line input is a separate stage from shell syntax parsing. After all, if you type a single +quote on the keyboard, you wouldn't expect it to disable all your keyboard shortcuts. So a control +character always has its designated effect, no matter if it's quoted or backslash-escaped. + +Also, some control characters are interpreted by the kernel tty layer instead, like CTRL-C to send +SIGINT. These can be an issue even with noninteractive shells, but only if using a pty for +communication, as opposed to a pipe. + +To be safe, you just have to avoid sending them. + +### Why not just use hex escapes? + +In any normal programming language, this would be no big deal. + +Any normal language has a way to escape arbitrary characters in strings by writing out their +numeric values. For example, Rust lets you write them in hexadecimal, like `"\x4f"` (or +`"\u{1d546}"` for Unicode). 
In this way, arbitrary strings can be represented using only 'nice' +simple characters. Any remotely suspicious character can be replaced with a numeric escape +sequence, where the escape sequence itself consists only of alphanumeric characters and some +punctuation. The result may not be the most readable[^choices], but it's quite safe from being +misinterpreted or corrupted in transit. + +Shell is not normal. It has no numeric escape sequences. + +There are a few different ways to quote characters (unquoted, unquoted-with-backslash, single +quotes, double quotes), but all of them involve writing the character itself. If the input +contains a control character, the output must contain that same character. + +### Mitigation: terminal filters + +In practice, automating interactive shells like in the above example is pretty uncommon these days. +In most cases, the only way for a programmatically generated string to make its way to the input of +an interactive shell is if a human copies and pastes it into their terminal. + +And many terminals detect when you paste a string containing control characters. iTerm2 strips +them out; gnome-terminal replaces them with alternate characters[^gr]; Kitty outright prompts for +confirmation. This mitigates the risk. + +But it's not perfect. Some other terminals don't implement this check or implement it incorrectly. +Also, these checks tend to not filter the tab character, which could trigger tab completion. In +most cases that's a non-issue, because most shells support paste bracketing, which disables tab and +some other control characters[^bracketing] within pasted text. But in some cases paste bracketing +gets disabled. + +### Future possibility: ANSI-C quoting + +I said that shell syntax has no numeric escapes, but that only applies to *portable* shell syntax. +Bash and Zsh support an obscure alternate quoting style with the syntax `$'foo'`. 
It's called +["ANSI-C quoting"][ansic], and inside it you can use all the escape sequences supported by C, +including hex escapes: + +```bash +$ echo $'\x41\n\x42' +A +B +``` + +But other shells don't support it — including Dash, a popular choice for `/bin/sh`, and Busybox's +Ash, frequently seen on stripped-down embedded systems. This crate's quoting functionality [tries +to be compatible](crate#compatibility) with those shells, plus all other POSIX-compatible shells. +That makes ANSI-C quoting a no-go. + +Still, future versions of this crate may provide an option to enable ANSI-C quoting, at the cost of +reduced portability. + +### Future possibility: printf + +Another option would be to invoke the `printf` command, which is required by POSIX to support octal +escapes. For example, you could 'escape' the Rust string `"\x01"` into the shell syntax `"$(printf +'\001')"`. The shell will execute the command `printf` with the first argument being literally a +backslash followed by three digits; `printf` will output the actual byte with value 1; and the +shell will substitute that back into the original command. + +The problem is that 'escaping' a string into a command substitution just feels too surprising. If +nothing else, it only works with an actual shell; [other languages' shell parsing +routines](crate#compatibility) wouldn't understand it. Neither would this crate's own parser, +though that could be fixed. + +Future versions of this crate may provide an option to use `printf` for quoting. + +### Special note: newlines + +Did you know that `\r` and `\n` are control characters? They aren't as dangerous as other control +characters (if quoted properly). But there's still an issue with them in interactive contexts. + +Namely, in some cases, interactive shells and/or the tty layer will 'helpfully' translate between +different line ending conventions. The possibilities include replacing `\r` with `\n`, replacing +`\n` with `\r\n`, and others. 
This can't result in command injection, but it's still a lossy +transformation which can result in a failure to round-trip (i.e. the shell sees a different string +from what was originally passed to `quote`). + +Numeric escapes would solve this as well. + +# Solved issues + +## Solved: Past vulnerability (GHSA-r7qv-8r2h-pg27 / RUSTSEC-2024-0006) + +Versions of this crate before 1.3.0 did not quote `{`, `}`, and `\xa0`. + +See: +- <https://github.com/comex/rust-shlex/security/advisories/GHSA-r7qv-8r2h-pg27> +- <https://rustsec.org/advisories/RUSTSEC-2024-0006.html> + +## Solved: `!` and `^` + +There are two non-control characters which have a special meaning in interactive contexts only: `!` and +`^`. Luckily, these can be escaped adequately. + +The `!` character triggers [history expansion][he]; the `^` character can trigger a variant of +history expansion known as [Quick Substitution][qs]. Both of these characters get expanded even +inside of double-quoted strings\! + +If we're in a double-quoted string, then we can't just escape these characters with a backslash. +Only a specific set of characters can be backslash-escaped inside double quotes; the set of +supported characters depends on the shell, but it often doesn't include `!` and `^`.[^escbs] +Trying to backslash-escape an unsupported character produces a literal backslash: +```bash +$ echo "\!" +\! +``` + +However, these characters don't get expanded in single-quoted strings, so this crate just +single-quotes them. + +But there's a Bash bug where `^` actually does get partially expanded in single-quoted strings: +```bash +$ echo ' +> ^a^b +> ' + +!!:s^a^b +``` + +To work around that, this crate forces `^` to appear right after an opening single quote. For +example, the string `"^` is quoted into `'"''^'` instead of `'"^'`. This restriction is overkill, +since `^` is only meaningful right after a newline, but it's a sufficient restriction (after all, a +`^` character can't be preceded by a newline if it's forced to be preceded by a single quote), and +for now it simplifies things. 
+ +## Solved: `\xa0` + +The byte `\xa0` may be treated as a shell word separator, specifically on Bash on macOS when using +the default UTF-8 locale, only when the input is invalid UTF-8. This crate handles the issue by +always using quotes for arguments containing this byte. + +In fact, this crate always uses quotes for arguments containing any non-ASCII bytes. This may be +changed in the future, since it's a bit unfriendly to non-English users. But for now it +minimizes risk, especially considering the large number of different legacy single-byte locales +someone might hypothetically be running their shell in. + +### Demonstration + +```bash +$ echo -e 'ls a\xa0b' | bash +ls: a: No such file or directory +ls: b: No such file or directory +``` +The normal behavior would be to output a single line, e.g.: +```bash +$ echo -e 'ls a\xa0b' | bash +ls: cannot access 'a'$'\240''b': No such file or directory +``` +(The specific quoting in the error doesn't matter.) + +### Cause + +Just for fun, here's why this behavior occurs: + +Bash decides which bytes serve as word separators based on the libc function [`isblank`][isblank]. +On macOS on UTF-8 locales, this passes for `\xa0`, corresponding to U+00A0 NO-BREAK SPACE. + +This is doubly unique compared to the other systems I tested (Linux/glibc, Linux/musl, and +Windows/MSVC). First, the other systems don't allow bytes in the range [0x80, 0xFF] to pass +isfoo functions in UTF-8 locales, even if the corresponding Unicode codepoint +does pass, as determined by the wide-character equivalent function, iswfoo. +Second, the other systems don't treat U+00A0 as blank (even using `iswblank`). + +Meanwhile, Bash checks for multi-byte sequences and forbids them from being treated as special +characters, so the proper UTF-8 encoding of U+00A0, `b"\xc2\xa0"`, is not treated as a word +separator. Treatment as a word separator only happens for `b"\xa0"` alone, which is illegal UTF-8. 
+ +[ansic]: https://www.gnu.org/software/bash/manual/html_node/ANSI_002dC-Quoting.html +[he]: https://www.gnu.org/software/bash/manual/html_node/History-Interaction.html +[qs]: https://www.gnu.org/software/bash/manual/html_node/Event-Designators.html +[isblank]: https://man7.org/linux/man-pages/man3/isblank.3p.html +[nul]: #nul-bytes + +[^choices]: This can lead to tough choices over which + characters to escape and which to leave as-is, especially when Unicode gets involved and you + have to balance the risk of confusion with the benefit of properly supporting non-English + languages. +
+
+ We don't have the luxury of those choices. + +[^gr]: For example, backspace (in Unicode lingo, U+0008 BACKSPACE) turns into U+2408 SYMBOL FOR BACKSPACE. + +[^bracketing]: It typically disables almost all handling of control characters by the shell proper, + but one necessary exception is the end-of-paste sequence itself (which starts with the control + character `\x1b`). In addition, paste bracketing does not suppress handling of control + characters by the kernel tty layer, such as `\x03` sending SIGINT (which typically clears the + currently typed command, making it dangerous in a similar way to `\x01`). + +[^escbs]: For example, Dash doesn't remove the backslash from `"\!"` because it simply doesn't know + anything about `!` as a special character: it doesn't support history expansion. On the other + end of the spectrum, Zsh supports history expansion and does remove the backslash — though only + in interactive mode. Bash's behavior is weirder. It supports history expansion, and if you + write `"\!"`, the backslash does prevent history expansion from occurring — but it doesn't get + removed! + +*/ + +// `use` declarations to make auto links work: +use ::{quote, join, Shlex, Quoter, QuoteError}; + +// TODO: add more about copy-paste and human readability. -- cgit v1.2.3