aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJeff Vander Stoep <jeffv@google.com>2024-02-06 12:04:30 +0000
committerAutomerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>2024-02-06 12:04:30 +0000
commit52363e2196b542f4f5af7c50f54d709b0ae9a76f (patch)
tree4b29caddb5fb6ef3d61025c0e200e7447449c8d0
parent35e1d8b055ec3348dae989b958938175ffa067b4 (diff)
parentbce9dd1ed2cdcf01509b6fbf5302ffaa7714e935 (diff)
downloadshlex-emu-34-2-dev.tar.gz
Upgrade shlex to 1.3.0 am: bce9dd1ed2HEADmastermainemu-34-2-dev
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/shlex/+/2949488 Change-Id: I28dd7465fe47761c8c64bebf1a513117147885df Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
-rw-r--r--.cargo_vcs_info.json7
-rw-r--r--.gitignore9
-rw-r--r--Android.bp4
-rw-r--r--CHANGELOG.md4
-rw-r--r--Cargo.toml27
-rw-r--r--Cargo.toml.orig9
-rw-r--r--METADATA25
-rw-r--r--README.md13
-rw-r--r--src/bytes.rs576
-rw-r--r--src/lib.rs399
-rw-r--r--src/quoting_warning.md365
11 files changed, 1262 insertions, 176 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index 02eb82f..efa0c6e 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,5 +1,6 @@
{
"git": {
- "sha1": "8638f145d9356eed9c83e7b2f13c5209e72f0e27"
- }
-}
+ "sha1": "4a0724b0b62ef715467875b040a890ce75a8a829"
+ },
+ "path_in_vcs": ""
+} \ No newline at end of file
diff --git a/.gitignore b/.gitignore
index cd23ebe..d36a04d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,8 @@
-/target/
-Cargo.lock
+nocommit/
+target/
+artifacts/
+corpus/
+/Cargo.lock
**/*.rs.bk
+.*.sw?
+.sw?
diff --git a/Android.bp b/Android.bp
index ace93db..3ddb79c 100644
--- a/Android.bp
+++ b/Android.bp
@@ -42,7 +42,7 @@ rust_library_host {
name: "libshlex",
crate_name: "shlex",
cargo_env_compat: true,
- cargo_pkg_version: "1.1.0",
+ cargo_pkg_version: "1.3.0",
srcs: ["src/lib.rs"],
edition: "2015",
features: [
@@ -55,7 +55,7 @@ rust_test_host {
name: "shlex_test_src_lib",
crate_name: "shlex",
cargo_env_compat: true,
- cargo_pkg_version: "1.1.0",
+ cargo_pkg_version: "1.3.0",
srcs: ["src/lib.rs"],
test_suites: ["general-tests"],
auto_gen_config: true,
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 50d2e6e..95552b4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# 1.2.0
+
+* Adds `bytes` module to support operating directly on byte strings.
+
# 1.1.0
* Adds the `std` feature (enabled by default)
diff --git a/Cargo.toml b/Cargo.toml
index 2741ed8..2b66892 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,19 +3,30 @@
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
-# to registry (e.g., crates.io) dependencies
+# to registry (e.g., crates.io) dependencies.
#
-# If you believe there's an error in this file please file an
-# issue against the rust-lang/cargo repository. If you're
-# editing this file be aware that the upstream Cargo.toml
-# will likely look very different (and much more reasonable)
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
[package]
+rust-version = "1.46.0"
name = "shlex"
-version = "1.1.0"
-authors = ["comex <comexk@gmail.com>", "Fenhl <fenhl@fenhl.net>"]
+version = "1.3.0"
+authors = [
+ "comex <comexk@gmail.com>",
+ "Fenhl <fenhl@fenhl.net>",
+ "Adrian Taylor <adetaylor@chromium.org>",
+ "Alex Touchet <alextouchet@outlook.com>",
+ "Daniel Parks <dp+git@oxidized.org>",
+ "Garrett Berg <googberg@gmail.com>",
+]
description = "Split a string into shell words, like Python's shlex."
-categories = ["command-line-interface", "parser-implementations"]
+readme = "README.md"
+categories = [
+ "command-line-interface",
+ "parser-implementations",
+]
license = "MIT OR Apache-2.0"
repository = "https://github.com/comex/rust-shlex"
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index 57fb62b..c3644af 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,9 +1,13 @@
[package]
name = "shlex"
-version = "1.1.0"
+version = "1.3.0"
authors = [
"comex <comexk@gmail.com>",
- "Fenhl <fenhl@fenhl.net>"
+ "Fenhl <fenhl@fenhl.net>",
+ "Adrian Taylor <adetaylor@chromium.org>",
+ "Alex Touchet <alextouchet@outlook.com>",
+ "Daniel Parks <dp+git@oxidized.org>",
+ "Garrett Berg <googberg@gmail.com>",
]
license = "MIT OR Apache-2.0"
repository = "https://github.com/comex/rust-shlex"
@@ -12,6 +16,7 @@ categories = [
"command-line-interface",
"parser-implementations"
]
+rust-version = "1.46.0"
[features]
std = []
diff --git a/METADATA b/METADATA
index 9ee61c1..b7ec356 100644
--- a/METADATA
+++ b/METADATA
@@ -1,19 +1,20 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update external/rust/crates/shlex
+# For more info, check https://cs.android.com/android/platform/superproject/+/main:tools/external_updater/README.md
+
name: "shlex"
description: "Split a string into shell words, like Python\'s shlex."
third_party {
- url {
- type: HOMEPAGE
- value: "https://crates.io/crates/shlex"
- }
- url {
- type: ARCHIVE
- value: "https://static.crates.io/crates/shlex/shlex-1.1.0.crate"
- }
- version: "1.1.0"
license_type: NOTICE
last_upgrade_date {
- year: 2021
- month: 9
- day: 22
+ year: 2024
+ month: 2
+ day: 5
+ }
+ homepage: "https://crates.io/crates/shlex"
+ identifier {
+ type: "Archive"
+ value: "https://static.crates.io/crates/shlex/shlex-1.3.0.crate"
+ version: "1.3.0"
}
}
diff --git a/README.md b/README.md
index 6778828..6400a6f 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,11 @@
+[![ci badge]][ci link] [![crates.io badge]][crates.io link] [![docs.rs badge]][docs.rs link]
+
+[crates.io badge]: https://img.shields.io/crates/v/shlex.svg?style=flat-square
+[crates.io link]: https://crates.io/crates/shlex
+[docs.rs badge]: https://img.shields.io/badge/docs-online-dddddd.svg?style=flat-square
+[docs.rs link]: https://docs.rs/shlex
+[ci badge]: https://img.shields.io/github/actions/workflow/status/comex/rust-shlex/test.yml?branch=master&style=flat-square
+[ci link]: https://github.com/comex/rust-shlex/actions
Same idea as (but implementation not directly based on) the Python shlex
module. However, this implementation does not support any of the Python
@@ -8,8 +16,9 @@ You only get the default settings of shlex.split, which mimic the POSIX shell:
This implementation also deviates from the Python version in not treating \r
specially, which I believe is more compliant.
-The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate
-over the bytes directly as a micro-optimization.
+This crate can be used on either normal Rust strings, or on byte strings with
+the `bytes` module. The algorithms used are oblivious to UTF-8 high bytes, so
+internally they all work on bytes directly as a micro-optimization.
Disabling the `std` feature (which is enabled by default) will allow the crate
to work in `no_std` environments, where the `alloc` crate, and a global
diff --git a/src/bytes.rs b/src/bytes.rs
new file mode 100644
index 0000000..af8daad
--- /dev/null
+++ b/src/bytes.rs
@@ -0,0 +1,576 @@
+// Copyright 2015 Nicholas Allegra (comex).
+// Licensed under the Apache License, Version 2.0 <https://www.apache.org/licenses/LICENSE-2.0> or
+// the MIT license <https://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+//! [`Shlex`] and friends for byte strings.
+//!
+//! This is used internally by the [outer module](crate), and may be more
+//! convenient if you are working with byte slices (`[u8]`) or types that are
+//! wrappers around bytes, such as [`OsStr`](std::ffi::OsStr):
+//!
+//! ```rust
+//! #[cfg(unix)] {
+//! use shlex::bytes::quote;
+//! use std::ffi::OsStr;
+//! use std::os::unix::ffi::OsStrExt;
+//!
+//! // `\x80` is invalid in UTF-8.
+//! let os_str = OsStr::from_bytes(b"a\x80b c");
+//! assert_eq!(quote(os_str.as_bytes()), &b"'a\x80b c'"[..]);
+//! }
+//! ```
+//!
+//! (On Windows, `OsStr` uses 16 bit wide characters so this will not work.)
+
+extern crate alloc;
+use alloc::vec::Vec;
+use alloc::borrow::Cow;
+#[cfg(test)]
+use alloc::vec;
+#[cfg(test)]
+use alloc::borrow::ToOwned;
+#[cfg(all(doc, not(doctest)))]
+use crate::{self as shlex, quoting_warning};
+
+use super::QuoteError;
+
+/// An iterator that takes an input byte string and splits it into the words using the same syntax as
+/// the POSIX shell.
+pub struct Shlex<'a> {
+ in_iter: core::slice::Iter<'a, u8>,
+ /// The number of newlines read so far, plus one.
+ pub line_no: usize,
+ /// An input string is erroneous if it ends while inside a quotation or right after an
+ /// unescaped backslash. Since Iterator does not have a mechanism to return an error, if that
+ /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to
+ /// true; best to check it after you're done iterating.
+ pub had_error: bool,
+}
+
+impl<'a> Shlex<'a> {
+ pub fn new(in_bytes: &'a [u8]) -> Self {
+ Shlex {
+ in_iter: in_bytes.iter(),
+ line_no: 1,
+ had_error: false,
+ }
+ }
+
+ fn parse_word(&mut self, mut ch: u8) -> Option<Vec<u8>> {
+ let mut result: Vec<u8> = Vec::new();
+ loop {
+ match ch as char {
+ '"' => if let Err(()) = self.parse_double(&mut result) {
+ self.had_error = true;
+ return None;
+ },
+ '\'' => if let Err(()) = self.parse_single(&mut result) {
+ self.had_error = true;
+ return None;
+ },
+ '\\' => if let Some(ch2) = self.next_char() {
+ if ch2 != '\n' as u8 { result.push(ch2); }
+ } else {
+ self.had_error = true;
+ return None;
+ },
+ ' ' | '\t' | '\n' => { break; },
+ _ => { result.push(ch as u8); },
+ }
+ if let Some(ch2) = self.next_char() { ch = ch2; } else { break; }
+ }
+ Some(result)
+ }
+
+ fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
+ loop {
+ if let Some(ch2) = self.next_char() {
+ match ch2 as char {
+ '\\' => {
+ if let Some(ch3) = self.next_char() {
+ match ch3 as char {
+ // \$ => $
+ '$' | '`' | '"' | '\\' => { result.push(ch3); },
+ // \<newline> => nothing
+ '\n' => {},
+ // \x => \x (the backslash is kept)
+ _ => { result.push('\\' as u8); result.push(ch3); }
+ }
+ } else {
+ return Err(());
+ }
+ },
+ '"' => { return Ok(()); },
+ _ => { result.push(ch2); },
+ }
+ } else {
+ return Err(());
+ }
+ }
+ }
+
+ fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
+ loop {
+ if let Some(ch2) = self.next_char() {
+ match ch2 as char {
+ '\'' => { return Ok(()); },
+ _ => { result.push(ch2); },
+ }
+ } else {
+ return Err(());
+ }
+ }
+ }
+
+ fn next_char(&mut self) -> Option<u8> {
+ let res = self.in_iter.next().copied();
+ if res == Some(b'\n') { self.line_no += 1; }
+ res
+ }
+}
+
+impl<'a> Iterator for Shlex<'a> {
+ type Item = Vec<u8>;
+ fn next(&mut self) -> Option<Self::Item> {
+ if let Some(mut ch) = self.next_char() {
+ // skip initial whitespace
+ loop {
+ match ch as char {
+ ' ' | '\t' | '\n' => {},
+ '#' => {
+ while let Some(ch2) = self.next_char() {
+ if ch2 as char == '\n' { break; }
+ }
+ },
+ _ => { break; }
+ }
+ if let Some(ch2) = self.next_char() { ch = ch2; } else { return None; }
+ }
+ self.parse_word(ch)
+ } else { // no initial character
+ None
+ }
+ }
+
+}
+
+/// Convenience function that consumes the whole byte string at once. Returns None if the input was
+/// erroneous.
+pub fn split(in_bytes: &[u8]) -> Option<Vec<Vec<u8>>> {
+ let mut shl = Shlex::new(in_bytes);
+ let res = shl.by_ref().collect();
+ if shl.had_error { None } else { Some(res) }
+}
+
+/// A more configurable interface to quote strings. If you only want the default settings you can
+/// use the convenience functions [`try_quote`] and [`try_join`].
+///
+/// The string equivalent is [`shlex::Quoter`].
+#[derive(Default, Debug, Clone)]
+pub struct Quoter {
+ allow_nul: bool,
+ // TODO: more options
+}
+
+impl Quoter {
+ /// Create a new [`Quoter`] with default settings.
+ #[inline]
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Set whether to allow [nul bytes](quoting_warning#nul-bytes). By default they are not
+ /// allowed and will result in an error of [`QuoteError::Nul`].
+ #[inline]
+ pub fn allow_nul(mut self, allow: bool) -> Self {
+ self.allow_nul = allow;
+ self
+ }
+
+ /// Convenience function that consumes an iterable of words and turns it into a single byte string,
+ /// quoting words when necessary. Consecutive words will be separated by a single space.
+ pub fn join<'a, I: IntoIterator<Item = &'a [u8]>>(&self, words: I) -> Result<Vec<u8>, QuoteError> {
+ Ok(words.into_iter()
+ .map(|word| self.quote(word))
+ .collect::<Result<Vec<Cow<[u8]>>, QuoteError>>()?
+ .join(&b' '))
+ }
+
+ /// Given a single word, return a byte string suitable to encode it as a shell argument.
+ ///
+ /// If given valid UTF-8, this will never produce invalid UTF-8. This is because it only
+ /// ever inserts valid ASCII characters before or after existing ASCII characters (or
+ /// returns two single quotes if the input was an empty string). It will never modify a
+ /// multibyte UTF-8 character.
+ pub fn quote<'a>(&self, mut in_bytes: &'a [u8]) -> Result<Cow<'a, [u8]>, QuoteError> {
+ if in_bytes.is_empty() {
+ // Empty string. Special case that isn't meaningful as only part of a word.
+ return Ok(b"''"[..].into());
+ }
+ if !self.allow_nul && in_bytes.iter().any(|&b| b == b'\0') {
+ return Err(QuoteError::Nul);
+ }
+ let mut out: Vec<u8> = Vec::new();
+ while !in_bytes.is_empty() {
+ // Pick a quoting strategy for some prefix of the input. Normally this will cover the
+ // entire input, but in some cases we might need to divide the input into multiple chunks
+ // that are quoted differently.
+ let (cur_len, strategy) = quoting_strategy(in_bytes);
+ if cur_len == in_bytes.len() && strategy == QuotingStrategy::Unquoted && out.is_empty() {
+ // Entire string can be represented unquoted. Reuse the allocation.
+ return Ok(in_bytes.into());
+ }
+ let (cur_chunk, rest) = in_bytes.split_at(cur_len);
+ assert!(rest.len() < in_bytes.len()); // no infinite loop
+ in_bytes = rest;
+ append_quoted_chunk(&mut out, cur_chunk, strategy);
+ }
+ Ok(out.into())
+ }
+
+}
+
+#[derive(PartialEq)]
+enum QuotingStrategy {
+ /// No quotes and no backslash escapes. (If backslash escapes would be necessary, we use a
+ /// different strategy instead.)
+ Unquoted,
+ /// Single quoted.
+ SingleQuoted,
+ /// Double quotes, potentially with backslash escapes.
+ DoubleQuoted,
+ // TODO: add $'xxx' and "$(printf 'xxx')" styles
+}
+
+/// Is this ASCII byte okay to emit unquoted?
+const fn unquoted_ok(c: u8) -> bool {
+ match c as char {
+ // Allowed characters:
+ '+' | '-' | '.' | '/' | ':' | '@' | ']' | '_' |
+ '0'..='9' | 'A'..='Z' | 'a'..='z'
+ => true,
+
+ // Non-allowed characters:
+ // From POSIX https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html
+ // "The application shall quote the following characters if they are to represent themselves:"
+ '|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | '\'' | ' ' | '\t' | '\n' |
+ // "and the following may need to be quoted under certain circumstances[..]:"
+ '*' | '?' | '[' | '#' | '~' | '=' | '%' |
+ // Brace expansion. These ought to be in the POSIX list but aren't yet;
+ // see: https://www.austingroupbugs.net/view.php?id=1193
+ '{' | '}' |
+ // Also quote comma, just to be safe in the extremely odd case that the user of this crate
+ // is intentionally placing a quoted string inside a brace expansion, e.g.:
+ // format!("echo foo{{a,b,{}}}", shlex::quote(some_str))
+ ',' |
+ // '\r' is allowed in a word by all real shells I tested, but is treated as a word
+ // separator by Python `shlex`, and might be translated to '\n' in interactive mode.
+ '\r' |
+ // '!' and '^' are treated specially in interactive mode; see quoting_warning.
+ '!' | '^' |
+ // Nul bytes and control characters.
+ '\x00' ..= '\x1f' | '\x7f'
+ => false,
+ '\u{80}' ..= '\u{10ffff}' => {
+ // This is unreachable since `unquoted_ok` is only called for 0..128.
+ // Non-ASCII bytes are handled separately in `quoting_strategy`.
+ // Can't call unreachable!() from `const fn` on old Rust, so...
+ unquoted_ok(c)
+ },
+ }
+ // Note: The logic cited above for quoting comma might suggest that `..` should also be quoted
+ // (as a special case of brace expansion). But it's not necessary. There are three cases:
+ //
+ // 1. The user wants comma-based brace expansion, but the untrusted string being `quote`d
+ // contains `..`, so they get something like `{foo,bar,3..5}`.
+ // => That's safe; both Bash and Zsh expand this to `foo bar 3..5` rather than
+ // `foo bar 3 4 5`. The presence of commas disables sequence expression expansion.
+ //
+ // 2. The user wants comma-based brace expansion where the contents of the braces are a
+ // variable number of `quote`d strings and nothing else. There happens to be exactly
+ // one string and it contains `..`, so they get something like `{3..5}`.
+ // => Then this will expand as a sequence expression, which is unintended. But I don't mind,
+ // because any such code is already buggy. Suppose the untrusted string *didn't* contain
+ // `,` or `..`, resulting in shell input like `{foo}`. Then the shell would interpret it
+ // as the literal string `{foo}` rather than brace-expanding it into `foo`.
+ //
+ // 3. The user wants a sequence expression and wants to supply an untrusted string as one of
+ // the endpoints or the increment.
+ // => Well, that's just silly, since the endpoints can only be numbers or single letters.
+}
+
+/// Optimized version of `unquoted_ok`.
+fn unquoted_ok_fast(c: u8) -> bool {
+ const UNQUOTED_OK_MASK: u128 = {
+ // Make a mask of all bytes in 0..<0x80 that pass.
+ let mut c = 0u8;
+ let mut mask = 0u128;
+ while c < 0x80 {
+ if unquoted_ok(c) {
+ mask |= 1u128 << c;
+ }
+ c += 1;
+ }
+ mask
+ };
+ ((UNQUOTED_OK_MASK >> c) & 1) != 0
+}
+
+/// Is this ASCII byte okay to emit in single quotes?
+fn single_quoted_ok(c: u8) -> bool {
+ match c {
+ // No single quotes in single quotes.
+ b'\'' => false,
+ // To work around a Bash bug, ^ is only allowed right after an opening single quote; see
+ // quoting_warning.
+ b'^' => false,
+ // Backslashes in single quotes are literal according to POSIX, but Fish treats them as an
+ // escape character. Ban them. Fish doesn't aim to be POSIX-compatible, but we *can*
+ // achieve Fish compatibility using double quotes, so we might as well.
+ b'\\' => false,
+ _ => true
+ }
+}
+
+/// Is this ASCII byte okay to emit in double quotes?
+fn double_quoted_ok(c: u8) -> bool {
+ match c {
+ // Work around Python `shlex` bug where parsing "\`" and "\$" doesn't strip the
+ // backslash, even though POSIX requires it.
+ b'`' | b'$' => false,
+ // '!' and '^' are treated specially in interactive mode; see quoting_warning.
+ b'!' | b'^' => false,
+ _ => true
+ }
+}
+
+/// Given an input, return a quoting strategy that can cover some prefix of the string, along with
+/// the size of that prefix.
+///
+/// Precondition: input size is nonzero. (Empty strings are handled by the caller.)
+/// Postcondition: returned size is nonzero.
+#[cfg_attr(manual_codegen_check, inline(never))]
+fn quoting_strategy(in_bytes: &[u8]) -> (usize, QuotingStrategy) {
+ const UNQUOTED_OK: u8 = 1;
+ const SINGLE_QUOTED_OK: u8 = 2;
+ const DOUBLE_QUOTED_OK: u8 = 4;
+
+ let mut prev_ok = SINGLE_QUOTED_OK | DOUBLE_QUOTED_OK | UNQUOTED_OK;
+ let mut i = 0;
+
+ if in_bytes[0] == b'^' {
+ // To work around a Bash bug, ^ is only allowed right after an opening single quote; see
+ // quoting_warning.
+ prev_ok = SINGLE_QUOTED_OK;
+ i = 1;
+ }
+
+ while i < in_bytes.len() {
+ let c = in_bytes[i];
+ let mut cur_ok = prev_ok;
+
+ if c >= 0x80 {
+ // Normally, non-ASCII characters shouldn't require quoting, but see quoting_warning.md
+ // about \xa0. For now, just treat all non-ASCII characters as requiring quotes. This
+ // also ensures things are safe in the off-chance that you're in a legacy 8-bit locale that
+ // has additional characters satisfying `isblank`.
+ cur_ok &= !UNQUOTED_OK;
+ } else {
+ if !unquoted_ok_fast(c) {
+ cur_ok &= !UNQUOTED_OK;
+ }
+ if !single_quoted_ok(c){
+ cur_ok &= !SINGLE_QUOTED_OK;
+ }
+ if !double_quoted_ok(c) {
+ cur_ok &= !DOUBLE_QUOTED_OK;
+ }
+ }
+
+ if cur_ok == 0 {
+ // There are no quoting strategies that would work for both the previous characters and
+ // this one. So we have to end the chunk before this character. The caller will call
+ // `quoting_strategy` again to handle the rest of the string.
+ break;
+ }
+
+ prev_ok = cur_ok;
+ i += 1;
+ }
+
+ // Pick the best allowed strategy.
+ let strategy = if prev_ok & UNQUOTED_OK != 0 {
+ QuotingStrategy::Unquoted
+ } else if prev_ok & SINGLE_QUOTED_OK != 0 {
+ QuotingStrategy::SingleQuoted
+ } else if prev_ok & DOUBLE_QUOTED_OK != 0 {
+ QuotingStrategy::DoubleQuoted
+ } else {
+ unreachable!()
+ };
+ debug_assert!(i > 0);
+ (i, strategy)
+}
+
+fn append_quoted_chunk(out: &mut Vec<u8>, cur_chunk: &[u8], strategy: QuotingStrategy) {
+ match strategy {
+ QuotingStrategy::Unquoted => {
+ out.extend_from_slice(cur_chunk);
+ },
+ QuotingStrategy::SingleQuoted => {
+ out.reserve(cur_chunk.len() + 2);
+ out.push(b'\'');
+ out.extend_from_slice(cur_chunk);
+ out.push(b'\'');
+ },
+ QuotingStrategy::DoubleQuoted => {
+ out.reserve(cur_chunk.len() + 2);
+ out.push(b'"');
+ for &c in cur_chunk.into_iter() {
+ if let b'$' | b'`' | b'"' | b'\\' = c {
+ // Add a preceding backslash.
+ // Note: We shouldn't actually get here for $ and ` because they don't pass
+ // `double_quoted_ok`.
+ out.push(b'\\');
+ }
+ // Add the character itself.
+ out.push(c);
+ }
+ out.push(b'"');
+ },
+ }
+}
+
+/// Convenience function that consumes an iterable of words and turns it into a single byte string,
+/// quoting words when necessary. Consecutive words will be separated by a single space.
+///
+/// Uses default settings except that nul bytes are passed through, which [may be
+/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated.
+///
+/// Equivalent to [`Quoter::new().allow_nul(true).join(words).unwrap()`](Quoter).
+///
+/// (That configuration never returns `Err`, so this function does not panic.)
+///
+/// The string equivalent is [shlex::join].
+#[deprecated(since = "1.3.0", note = "replace with `try_join(words)?` to avoid nul byte danger")]
+pub fn join<'a, I: IntoIterator<Item = &'a [u8]>>(words: I) -> Vec<u8> {
+ Quoter::new().allow_nul(true).join(words).unwrap()
+}
+
+/// Convenience function that consumes an iterable of words and turns it into a single byte string,
+/// quoting words when necessary. Consecutive words will be separated by a single space.
+///
+/// Uses default settings. The only error that can be returned is [`QuoteError::Nul`].
+///
+/// Equivalent to [`Quoter::new().join(words)`](Quoter).
+///
+/// The string equivalent is [shlex::try_join].
+pub fn try_join<'a, I: IntoIterator<Item = &'a [u8]>>(words: I) -> Result<Vec<u8>, QuoteError> {
+ Quoter::new().join(words)
+}
+
+/// Given a single word, return a byte string suitable to encode it as a shell argument.
+///
+/// Uses default settings except that nul bytes are passed through, which [may be
+/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated.
+///
+/// Equivalent to [`Quoter::new().allow_nul(true).quote(in_bytes).unwrap()`](Quoter).
+///
+/// (That configuration never returns `Err`, so this function does not panic.)
+///
+/// The string equivalent is [shlex::quote].
+#[deprecated(since = "1.3.0", note = "replace with `try_quote(str)?` to avoid nul byte danger")]
+pub fn quote(in_bytes: &[u8]) -> Cow<[u8]> {
+ Quoter::new().allow_nul(true).quote(in_bytes).unwrap()
+}
+
+/// Given a single word, return a byte string suitable to encode it as a shell argument.
+///
+/// Uses default settings. The only error that can be returned is [`QuoteError::Nul`].
+///
+/// Equivalent to [`Quoter::new().quote(in_bytes)`](Quoter).
+///
+/// (Unlike the deprecated [`quote`], nul bytes are reported as `Err` instead of being passed through.)
+///
+/// The string equivalent is [shlex::try_quote].
+pub fn try_quote(in_bytes: &[u8]) -> Result<Cow<[u8]>, QuoteError> {
+ Quoter::new().quote(in_bytes)
+}
+
+#[cfg(test)]
+const INVALID_UTF8: &[u8] = b"\xa1";
+#[cfg(test)]
+const INVALID_UTF8_SINGLEQUOTED: &[u8] = b"'\xa1'";
+
+#[test]
+#[allow(invalid_from_utf8)]
+fn test_invalid_utf8() {
+ // Check that our test string is actually invalid UTF-8.
+ assert!(core::str::from_utf8(INVALID_UTF8).is_err());
+}
+
+#[cfg(test)]
+static SPLIT_TEST_ITEMS: &'static [(&'static [u8], Option<&'static [&'static [u8]]>)] = &[
+ (b"foo$baz", Some(&[b"foo$baz"])),
+ (b"foo baz", Some(&[b"foo", b"baz"])),
+ (b"foo\"bar\"baz", Some(&[b"foobarbaz"])),
+ (b"foo \"bar\"baz", Some(&[b"foo", b"barbaz"])),
+ (b" foo \nbar", Some(&[b"foo", b"bar"])),
+ (b"foo\\\nbar", Some(&[b"foobar"])),
+ (b"\"foo\\\nbar\"", Some(&[b"foobar"])),
+ (b"'baz\\$b'", Some(&[b"baz\\$b"])),
+ (b"'baz\\\''", None),
+ (b"\\", None),
+ (b"\"\\", None),
+ (b"'\\", None),
+ (b"\"", None),
+ (b"'", None),
+ (b"foo #bar\nbaz", Some(&[b"foo", b"baz"])),
+ (b"foo #bar", Some(&[b"foo"])),
+ (b"foo#bar", Some(&[b"foo#bar"])),
+ (b"foo\"#bar", None),
+ (b"'\\n'", Some(&[b"\\n"])),
+ (b"'\\\\n'", Some(&[b"\\\\n"])),
+ (INVALID_UTF8, Some(&[INVALID_UTF8])),
+];
+
+#[test]
+fn test_split() {
+ for &(input, output) in SPLIT_TEST_ITEMS {
+ assert_eq!(split(input), output.map(|o| o.iter().map(|&x| x.to_owned()).collect()));
+ }
+}
+
+#[test]
+fn test_lineno() {
+ let mut sh = Shlex::new(b"\nfoo\nbar");
+ while let Some(word) = sh.next() {
+ if word == b"bar" {
+ assert_eq!(sh.line_no, 3);
+ }
+ }
+}
+
+#[test]
+#[allow(deprecated)]
+fn test_quote() {
+ // Validate behavior with invalid UTF-8:
+ assert_eq!(quote(INVALID_UTF8), INVALID_UTF8_SINGLEQUOTED);
+ // Replicate a few tests from lib.rs. No need to replicate all of them.
+ assert_eq!(quote(b""), &b"''"[..]);
+ assert_eq!(quote(b"foobar"), &b"foobar"[..]);
+ assert_eq!(quote(b"foo bar"), &b"'foo bar'"[..]);
+ assert_eq!(quote(b"'\""), &b"\"'\\\"\""[..]);
+ assert_eq!(quote(b""), &b"''"[..]);
+}
+
+#[test]
+#[allow(deprecated)]
+fn test_join() {
+ // Validate behavior with invalid UTF-8:
+ assert_eq!(join(vec![INVALID_UTF8]), INVALID_UTF8_SINGLEQUOTED);
+ // Replicate a few tests from lib.rs. No need to replicate all of them.
+ assert_eq!(join(vec![]), &b""[..]);
+ assert_eq!(join(vec![&b""[..]]), b"''");
+}
diff --git a/src/lib.rs b/src/lib.rs
index 31b54bd..aa5c306 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,20 +3,37 @@
// the MIT license <https://opensource.org/licenses/MIT>, at your option. This file may not be
// copied, modified, or distributed except according to those terms.
-//! Same idea as (but implementation not directly based on) the Python shlex module. However, this
-//! implementation does not support any of the Python module's customization because it makes
-//! parsing slower and is fairly useless. You only get the default settings of shlex.split, which
-//! mimic the POSIX shell:
-//! <https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html>
+//! Parse strings like, and escape strings for, POSIX shells.
//!
-//! This implementation also deviates from the Python version in not treating `\r` specially, which
-//! I believe is more compliant.
-//!
-//! The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate over the bytes
-//! directly as a micro-optimization.
+//! Same idea as (but implementation not directly based on) the Python shlex module.
//!
//! Disabling the `std` feature (which is enabled by default) will allow the crate to work in
//! `no_std` environments, where the `alloc` crate, and a global allocator, are available.
+//!
+//! ## <span style="color:red">Warning</span>
+//!
+//! The [`try_quote`]/[`try_join`] family of APIs does not quote control characters (because they
+//! cannot be quoted portably).
+//!
+//! This is fully safe in noninteractive contexts, like shell scripts and `sh -c` arguments (or
+//! even scripts `source`d from interactive shells).
+//!
+//! But if you are quoting for human consumption, you should keep in mind that ugly inputs produce
+//! ugly outputs (which may not be copy-pastable).
+//!
+//! And if by chance you are piping the output of [`try_quote`]/[`try_join`] directly to the stdin
+//! of an interactive shell, you should stop, because control characters can lead to arbitrary
+//! command injection.
+//!
+//! For more information, and for information about more minor issues, please see [quoting_warning].
+//!
+//! ## Compatibility
+//!
+//! This crate's quoting functionality tries to be compatible with **any POSIX-compatible shell**;
+//! it's tested against `bash`, `zsh`, `dash`, Busybox `ash`, and `mksh`, plus `fish` (which is not
+//! POSIX-compatible but close enough).
+//!
+//! It also aims to be compatible with Python `shlex` and C `wordexp`.
#![cfg_attr(not(feature = "std"), no_std)]
@@ -29,124 +46,45 @@ use alloc::vec;
#[cfg(test)]
use alloc::borrow::ToOwned;
+pub mod bytes;
+#[cfg(all(doc, not(doctest)))]
+#[path = "quoting_warning.md"]
+pub mod quoting_warning;
+
/// An iterator that takes an input string and splits it into the words using the same syntax as
/// the POSIX shell.
-pub struct Shlex<'a> {
- in_iter: core::str::Bytes<'a>,
- /// The number of newlines read so far, plus one.
- pub line_no: usize,
- /// An input string is erroneous if it ends while inside a quotation or right after an
- /// unescaped backslash. Since Iterator does not have a mechanism to return an error, if that
- /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to
- /// true; best to check it after you're done iterating.
- pub had_error: bool,
-}
+///
+/// See [`bytes::Shlex`].
+pub struct Shlex<'a>(bytes::Shlex<'a>);
impl<'a> Shlex<'a> {
pub fn new(in_str: &'a str) -> Self {
- Shlex {
- in_iter: in_str.bytes(),
- line_no: 1,
- had_error: false,
- }
- }
-
- fn parse_word(&mut self, mut ch: u8) -> Option<String> {
- let mut result: Vec<u8> = Vec::new();
- loop {
- match ch as char {
- '"' => if let Err(()) = self.parse_double(&mut result) {
- self.had_error = true;
- return None;
- },
- '\'' => if let Err(()) = self.parse_single(&mut result) {
- self.had_error = true;
- return None;
- },
- '\\' => if let Some(ch2) = self.next_char() {
- if ch2 != '\n' as u8 { result.push(ch2); }
- } else {
- self.had_error = true;
- return None;
- },
- ' ' | '\t' | '\n' => { break; },
- _ => { result.push(ch as u8); },
- }
- if let Some(ch2) = self.next_char() { ch = ch2; } else { break; }
- }
- unsafe { Some(String::from_utf8_unchecked(result)) }
+ Self(bytes::Shlex::new(in_str.as_bytes()))
}
+}
- fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
- loop {
- if let Some(ch2) = self.next_char() {
- match ch2 as char {
- '\\' => {
- if let Some(ch3) = self.next_char() {
- match ch3 as char {
- // \$ => $
- '$' | '`' | '"' | '\\' => { result.push(ch3); },
- // \<newline> => nothing
- '\n' => {},
- // \x => =x
- _ => { result.push('\\' as u8); result.push(ch3); }
- }
- } else {
- return Err(());
- }
- },
- '"' => { return Ok(()); },
- _ => { result.push(ch2); },
- }
- } else {
- return Err(());
- }
- }
+impl<'a> Iterator for Shlex<'a> {
+ type Item = String;
+ fn next(&mut self) -> Option<String> {
+ self.0.next().map(|byte_word| {
+ // Safety: given valid UTF-8, bytes::Shlex will always return valid UTF-8.
+ unsafe { String::from_utf8_unchecked(byte_word) }
+ })
}
+}
- fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
- loop {
- if let Some(ch2) = self.next_char() {
- match ch2 as char {
- '\'' => { return Ok(()); },
- _ => { result.push(ch2); },
- }
- } else {
- return Err(());
- }
- }
- }
+impl<'a> core::ops::Deref for Shlex<'a> {
+ type Target = bytes::Shlex<'a>;
- fn next_char(&mut self) -> Option<u8> {
- let res = self.in_iter.next();
- if res == Some('\n' as u8) { self.line_no += 1; }
- res
+ fn deref(&self) -> &Self::Target {
+ &self.0
}
}
-impl<'a> Iterator for Shlex<'a> {
- type Item = String;
- fn next(&mut self) -> Option<String> {
- if let Some(mut ch) = self.next_char() {
- // skip initial whitespace
- loop {
- match ch as char {
- ' ' | '\t' | '\n' => {},
- '#' => {
- while let Some(ch2) = self.next_char() {
- if ch2 as char == '\n' { break; }
- }
- },
- _ => { break; }
- }
- if let Some(ch2) = self.next_char() { ch = ch2; } else { return None; }
- }
- self.parse_word(ch)
- } else { // no initial character
- None
- }
+impl<'a> core::ops::DerefMut for Shlex<'a> {
+ fn deref_mut(&mut self) -> &mut Self::Target {
+ &mut self.0
}
-
}
/// Convenience function that consumes the whole string at once. Returns None if the input was
@@ -157,38 +95,151 @@ pub fn split(in_str: &str) -> Option<Vec<String>> {
if shl.had_error { None } else { Some(res) }
}
-/// Given a single word, return a string suitable to encode it as a shell argument.
-pub fn quote(in_str: &str) -> Cow<str> {
- if in_str.len() == 0 {
- "\"\"".into()
- } else if in_str.bytes().any(|c| match c as char {
- '|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | '\'' | ' ' | '\t' |
- '\r' | '\n' | '*' | '?' | '[' | '#' | '~' | '=' | '%' => true,
- _ => false
- }) {
- let mut out: Vec<u8> = Vec::new();
- out.push('"' as u8);
- for c in in_str.bytes() {
- match c as char {
- '$' | '`' | '"' | '\\' => out.push('\\' as u8),
- _ => ()
- }
- out.push(c);
+/// Errors from [`Quoter::quote`], [`Quoter::join`], etc. (and their [`bytes`] counterparts).
+///
+/// By default, the only error that can be returned is [`QuoteError::Nul`]. If you call
+/// `allow_nul(true)`, then no errors can be returned at all. Any error variants added in the
+/// future will not be enabled by default; they will be enabled through corresponding non-default
+/// [`Quoter`] options.
+///
+/// ...In theory. In the unlikely event that additional classes of inputs are discovered that,
+/// like nul bytes, are fundamentally unsafe to quote even for non-interactive shells, the risk
+/// will be mitigated by adding corresponding [`QuoteError`] variants that *are* enabled by
+/// default.
+#[non_exhaustive]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum QuoteError {
+ /// The input contained a nul byte. In most cases, shells fundamentally [cannot handle strings
+ /// containing nul bytes](quoting_warning#nul-bytes), no matter how they are quoted. But if
+ /// you're sure you can handle nul bytes, you can call `allow_nul(true)` on the `Quoter` to let
+ /// them pass through.
+ Nul,
+}
+
+impl core::fmt::Display for QuoteError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match self {
+ QuoteError::Nul => f.write_str("cannot shell-quote string containing nul byte"),
}
- out.push('"' as u8);
- unsafe { String::from_utf8_unchecked(out) }.into()
- } else {
- in_str.into()
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for QuoteError {}
+
+/// A more configurable interface to quote strings. If you only want the default settings you can
+/// use the convenience functions [`try_quote`] and [`try_join`].
+///
+/// The bytes equivalent is [`bytes::Quoter`].
+#[derive(Default, Debug, Clone)]
+pub struct Quoter {
+ inner: bytes::Quoter,
+}
+
+impl Quoter {
+ /// Create a new [`Quoter`] with default settings.
+ #[inline]
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Set whether to allow [nul bytes](quoting_warning#nul-bytes). By default they are not
+ /// allowed and will result in an error of [`QuoteError::Nul`].
+ #[inline]
+ pub fn allow_nul(mut self, allow: bool) -> Self {
+ self.inner = self.inner.allow_nul(allow);
+ self
+ }
+
+ /// Convenience function that consumes an iterable of words and turns it into a single string,
+ /// quoting words when necessary. Consecutive words will be separated by a single space.
+ pub fn join<'a, I: IntoIterator<Item = &'a str>>(&self, words: I) -> Result<String, QuoteError> {
+ // Safety: given valid UTF-8, bytes::join() will always return valid UTF-8.
+ self.inner.join(words.into_iter().map(|s| s.as_bytes()))
+ .map(|bytes| unsafe { String::from_utf8_unchecked(bytes) })
+ }
+
+ /// Given a single word, return a string suitable to encode it as a shell argument.
+ pub fn quote<'a>(&self, in_str: &'a str) -> Result<Cow<'a, str>, QuoteError> {
+ Ok(match self.inner.quote(in_str.as_bytes())? {
+ Cow::Borrowed(out) => {
+ // Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8.
+ unsafe { core::str::from_utf8_unchecked(out) }.into()
+ }
+ Cow::Owned(out) => {
+ // Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8.
+ unsafe { String::from_utf8_unchecked(out) }.into()
+ }
+ })
+ }
+}
+
+impl From<bytes::Quoter> for Quoter {
+ fn from(inner: bytes::Quoter) -> Quoter {
+ Quoter { inner }
+ }
+}
+
+impl From<Quoter> for bytes::Quoter {
+ fn from(quoter: Quoter) -> bytes::Quoter {
+ quoter.inner
}
}
/// Convenience function that consumes an iterable of words and turns it into a single string,
/// quoting words when necessary. Consecutive words will be separated by a single space.
+///
+/// Uses default settings except that nul bytes are passed through, which [may be
+/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated.
+///
+/// Equivalent to [`Quoter::new().allow_nul(true).join(words).unwrap()`](Quoter).
+///
+/// (That configuration never returns `Err`, so this function does not panic.)
+///
+/// The bytes equivalent is [bytes::join].
+#[deprecated(since = "1.3.0", note = "replace with `try_join(words)?` to avoid nul byte danger")]
pub fn join<'a, I: IntoIterator<Item = &'a str>>(words: I) -> String {
- words.into_iter()
- .map(quote)
- .collect::<Vec<_>>()
- .join(" ")
+ Quoter::new().allow_nul(true).join(words).unwrap()
+}
+
+/// Convenience function that consumes an iterable of words and turns it into a single string,
+/// quoting words when necessary. Consecutive words will be separated by a single space.
+///
+/// Uses default settings. The only error that can be returned is [`QuoteError::Nul`].
+///
+/// Equivalent to [`Quoter::new().join(words)`](Quoter).
+///
+/// The bytes equivalent is [bytes::try_join].
+pub fn try_join<'a, I: IntoIterator<Item = &'a str>>(words: I) -> Result<String, QuoteError> {
+ Quoter::new().join(words)
+}
+
+/// Given a single word, return a string suitable to encode it as a shell argument.
+///
+/// Uses default settings except that nul bytes are passed through, which [may be
+/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated.
+///
+/// Equivalent to [`Quoter::new().allow_nul(true).quote(in_str).unwrap()`](Quoter).
+///
+/// (That configuration never returns `Err`, so this function does not panic.)
+///
+/// The bytes equivalent is [bytes::quote].
+#[deprecated(since = "1.3.0", note = "replace with `try_quote(str)?` to avoid nul byte danger")]
+pub fn quote(in_str: &str) -> Cow<str> {
+ Quoter::new().allow_nul(true).quote(in_str).unwrap()
+}
+
+/// Given a single word, return a string suitable to encode it as a shell argument.
+///
+/// Uses default settings. The only error that can be returned is [`QuoteError::Nul`].
+///
+/// Equivalent to [`Quoter::new().quote(in_str)`](Quoter).
+///
+/// (Unlike the deprecated `quote`, nul bytes produce an `Err` instead of being passed through.)
+///
+/// The bytes equivalent is [bytes::try_quote].
+pub fn try_quote(in_str: &str) -> Result<Cow<str>, QuoteError> {
+ Quoter::new().quote(in_str)
}
#[cfg(test)]
@@ -233,17 +284,75 @@ fn test_lineno() {
}
#[test]
+#[cfg_attr(not(feature = "std"), allow(unreachable_code, unused_mut))]
fn test_quote() {
- assert_eq!(quote("foobar"), "foobar");
- assert_eq!(quote("foo bar"), "\"foo bar\"");
- assert_eq!(quote("\""), "\"\\\"\"");
- assert_eq!(quote(""), "\"\"");
+ // This is a list of (unquoted, quoted) pairs.
+ // But it's using a single long (raw) string literal with an ad-hoc format, just because it's
+ // hard to read if we have to put the test strings through Rust escaping on top of the escaping
+ // being tested. (Even raw string literals are noisy for short strings).
+ // Ad-hoc: "NL" is replaced with a literal newline; no other escape sequences.
+ let tests = r#"
+ <> => <''>
+ <foobar> => <foobar>
+ <foo bar> => <'foo bar'>
+ <"foo bar'"> => <"\"foo bar'\"">
+ <'foo bar'> => <"'foo bar'">
+ <"> => <'"'>
+ <"'> => <"\"'">
+ <hello!world> => <'hello!world'>
+ <'hello!world> => <"'hello"'!world'>
+ <'hello!> => <"'hello"'!'>
+ <hello ^ world> => <'hello ''^ world'>
+ <hello^> => <hello'^'>
+ <!world'> => <'!world'"'">
+ <{a, b}> => <'{a, b}'>
+ <NL> => <'NL'>
+ <^> => <'^'>
+ <foo^bar> => <foo'^bar'>
+ <NLx^> => <'NLx''^'>
+ <NL^x> => <'NL''^x'>
+ <NL ^x> => <'NL ''^x'>
+ <{a,b}> => <'{a,b}'>
+ <a,b> => <'a,b'>
+ <a..b => <a..b>
+ <'$> => <"'"'$'>
+ <"^> => <'"''^'>
+ "#;
+ let mut ok = true;
+ for test in tests.trim().split('\n') {
+ let parts: Vec<String> = test
+ .replace("NL", "\n")
+ .split("=>")
+ .map(|part| part.trim().trim_start_matches('<').trim_end_matches('>').to_owned())
+ .collect();
+ assert!(parts.len() == 2);
+ let unquoted = &*parts[0];
+ let quoted_expected = &*parts[1];
+ let quoted_actual = try_quote(&parts[0]).unwrap();
+ if quoted_expected != quoted_actual {
+ #[cfg(not(feature = "std"))]
+ panic!("FAIL: for input <{}>, expected <{}>, got <{}>",
+ unquoted, quoted_expected, quoted_actual);
+ #[cfg(feature = "std")]
+ println!("FAIL: for input <{}>, expected <{}>, got <{}>",
+ unquoted, quoted_expected, quoted_actual);
+ ok = false;
+ }
+ }
+ assert!(ok);
}
#[test]
+#[allow(deprecated)]
fn test_join() {
assert_eq!(join(vec![]), "");
- assert_eq!(join(vec![""]), "\"\"");
+ assert_eq!(join(vec![""]), "''");
assert_eq!(join(vec!["a", "b"]), "a b");
- assert_eq!(join(vec!["foo bar", "baz"]), "\"foo bar\" baz");
+ assert_eq!(join(vec!["foo bar", "baz"]), "'foo bar' baz");
+}
+
+#[test]
+fn test_fallible() {
+ assert_eq!(try_join(vec!["\0"]), Err(QuoteError::Nul));
+ assert_eq!(try_quote("\0"), Err(QuoteError::Nul));
}
diff --git a/src/quoting_warning.md b/src/quoting_warning.md
new file mode 100644
index 0000000..fab9857
--- /dev/null
+++ b/src/quoting_warning.md
@@ -0,0 +1,365 @@
+// vim: textwidth=99
+/*
+Meta note: This file is loaded as a .rs file by rustdoc only.
+*/
+/*!
+
+A more detailed version of the [warning at the top level](super#warning) about the `quote`/`join`
+family of APIs.
+
+In general, passing the output of these APIs to a shell should recover the original string(s).
+This page lists cases where it fails to do so.
+
+In noninteractive contexts, there are only minor issues. 'Noninteractive' includes shell scripts
+and `sh -c` arguments, or even scripts `source`d from interactive shells. The issues are:
+
+- [Nul bytes](#nul-bytes)
+
+- [Overlong commands](#overlong-commands)
+
+If you are writing directly to the stdin of an interactive (`-i`) shell (i.e., if you are
+pretending to be a terminal), or if you are writing to a cooked-mode pty (even if the other end is
+noninteractive), then there is a **severe** security issue:
+
+- [Control characters](#control-characters-interactive-contexts-only)
+
+Finally, there are some [solved issues](#solved-issues).
+
+# List of issues
+
+## Nul bytes
+
+For non-interactive shells, the most problematic input is nul bytes (bytes with value 0). The
+non-deprecated functions all default to returning [`QuoteError::Nul`] when encountering them, but
+the deprecated [`quote`] and [`join`] functions leave them as-is.
+
+In Unix, nul bytes can't appear in command arguments, environment variables, or filenames. It's
+not a question of proper quoting; they just can't be used at all. This is a consequence of Unix's
+system calls all being designed around nul-terminated C strings.
+
+Shells inherit that limitation. Most of them do not accept nul bytes in strings even internally.
+Even when they do, it's pretty much useless or even dangerous, since you can't pass them to
+external commands.
+
+In some cases, you might fail to pass the nul byte to the shell in the first place. For example,
+the following code uses [`join`] to tunnel a command over an SSH connection:
+
+```rust
+std::process::Command::new("ssh")
+ .arg("myhost")
+ .arg("--")
+ .arg(join(my_cmd_args))
+```
+
+If any argument in `my_cmd_args` contains a nul byte, then `join(my_cmd_args)` will contain a nul
+byte. But `join(my_cmd_args)` is itself being passed as an argument to a command (the ssh
+command), and command arguments can't contain nul bytes! So this will simply result in the
+`Command` failing to launch.
+
+Still, there are other ways to smuggle nul bytes into a shell. How the shell reacts depends on the
+shell and the method of smuggling. For example, here is Bash 5.2.21 exhibiting three different
+behaviors:
+
+- With ANSI-C quoting, the string is truncated at the first nul byte:
+ ```bash
+ $ echo $'foo\0bar' | hexdump -C
+ 00000000 66 6f 6f 0a |foo.|
+ ```
+
+- With command substitution, nul bytes are removed with a warning:
+ ```bash
+ $ echo $(printf 'foo\0bar') | hexdump -C
+ bash: warning: command substitution: ignored null byte in input
+ 00000000 66 6f 6f 62 61 72 0a |foobar.|
+ ```
+
+- When a nul byte appears directly in a shell script, it's removed with no warning:
+ ```bash
+ $ printf 'echo "foo\0bar"' | bash | hexdump -C
+ 00000000 66 6f 6f 62 61 72 0a |foobar.|
+ ```
+
+Zsh, in contrast, actually allows nul bytes internally, in shell variables and even arguments to
+builtin commands. But if a variable is exported to the environment, or if an argument is used for
+an external command, then the child process will see it silently truncated at the first nul. This
+might actually be more dangerous, depending on the use case.
+
+## Overlong commands
+
+If you pass a long string into a shell, several things might happen:
+
+- It might succeed, yet the shell might have trouble actually doing anything with it. For example:
+
+ ```bash
+ x=$(printf '%010000000d' 0); /bin/echo $x
+ bash: /bin/echo: Argument list too long
+ ```
+
+- If you're using certain shells (e.g. Busybox Ash) *and* using a pty for communication, then the
+ shell will impose a line length limit, ignoring all input past the limit.
+
+- If you're using a pty in cooked mode, then by default, if you write so many bytes as input that
+ it fills the kernel's internal buffer, the kernel will simply drop those bytes, instead of
+ blocking waiting for the shell to empty out the buffer. In other words, random bits of input can
+ be lost, which is obviously insecure.
+
+Future versions of this crate may add an option to [`Quoter`] to check the length for you.
+
+## Control characters (*interactive contexts only*)
+
+Control characters are the bytes from `\x00` to `\x1f`, plus `\x7f`. `\x00` (the nul byte) is
+discussed [above](#nul-bytes), but what about the rest? Well, many of them correspond to terminal
+keyboard shortcuts. For example, when you press Ctrl-A at a shell prompt, your terminal sends the
+byte `\x01`. The shell sees that byte and (if not configured differently) takes the standard
+action for Ctrl-A, which is to move the cursor to the beginning of the line.
+
+This means that it's quite dangerous to pipe bytes to an interactive shell. For example, here is a
+program that tries to tell Bash to echo an arbitrary string, 'safely':
+```rust
+use std::process::{Command, Stdio};
+use std::io::Write;
+
+let evil_string = "\x01do_something_evil; ";
+let quoted = shlex::try_quote(evil_string).unwrap();
+println!("quoted string is {:?}", quoted);
+
+let mut bash = Command::new("bash")
+ .arg("-i") // force interactive mode
+ .stdin(Stdio::piped())
+ .spawn()
+ .unwrap();
+let stdin = bash.stdin.as_mut().unwrap();
+write!(stdin, "echo {}\n", quoted).unwrap();
+```
+
+Here's the output of the program (with irrelevant bits removed):
+
+```text
+quoted string is "'\u{1}do_something_evil; '"
+/tmp comex$ do_something_evil; 'echo '
+bash: do_something_evil: command not found
+bash: echo : command not found
+```
+
+Even though we quoted it, Bash still ran an arbitrary command!
+
+This is not because the quoting was insufficient, per se. In single quotes, all input is supposed
+to be treated as raw data until the closing single quote. And in fact, this would work fine
+without the `"-i"` argument.
+
+But line input is a separate stage from shell syntax parsing. After all, if you type a single
+quote on the keyboard, you wouldn't expect it to disable all your keyboard shortcuts. So a control
+character always has its designated effect, no matter if it's quoted or backslash-escaped.
+
+Also, some control characters are interpreted by the kernel tty layer instead, like Ctrl-C to send
+SIGINT. These can be an issue even with noninteractive shells, but only if using a pty for
+communication, as opposed to a pipe.
+
+To be safe, you just have to avoid sending them.
+
+### Why not just use hex escapes?
+
+In any normal programming language, this would be no big deal.
+
+Any normal language has a way to escape arbitrary characters in strings by writing out their
+numeric values. For example, Rust lets you write them in hexadecimal, like `"\x4f"` (or
+`"\u{1d546}"` for Unicode). In this way, arbitrary strings can be represented using only 'nice'
+simple characters. Any remotely suspicious character can be replaced with a numeric escape
+sequence, where the escape sequence itself consists only of alphanumeric characters and some
+punctuation. The result may not be the most readable[^choices], but it's quite safe from being
+misinterpreted or corrupted in transit.
+
+Shell is not normal. It has no numeric escape sequences.
+
+There are a few different ways to quote characters (unquoted, unquoted-with-backslash, single
+quotes, double quotes), but all of them involve writing the character itself. If the input
+contains a control character, the output must contain that same character.
+
+### Mitigation: terminal filters
+
+In practice, automating interactive shells like in the above example is pretty uncommon these days.
+In most cases, the only way for a programmatically generated string to make its way to the input of
+an interactive shell is if a human copies and pastes it into their terminal.
+
+And many terminals detect when you paste a string containing control characters. iTerm2 strips
+them out; gnome-terminal replaces them with alternate characters[^gr]; Kitty outright prompts for
+confirmation. This mitigates the risk.
+
+But it's not perfect. Some other terminals don't implement this check or implement it incorrectly.
+Also, these checks tend to not filter the tab character, which could trigger tab completion. In
+most cases that's a non-issue, because most shells support paste bracketing, which disables tab and
+some other control characters[^bracketing] within pasted text. But in some cases paste bracketing
+gets disabled.
+
+### Future possibility: ANSI-C quoting
+
+I said that shell syntax has no numeric escapes, but that only applies to *portable* shell syntax.
+Bash and Zsh support an obscure alternate quoting style with the syntax `$'foo'`. It's called
+["ANSI-C quoting"][ansic], and inside it you can use all the escape sequences supported by C,
+including hex escapes:
+
+```bash
+$ echo $'\x41\n\x42'
+A
+B
+```
+
+But other shells don't support it — including Dash, a popular choice for `/bin/sh`, and Busybox's
+Ash, frequently seen on stripped-down embedded systems. This crate's quoting functionality [tries
+to be compatible](crate#compatibility) with those shells, plus all other POSIX-compatible shells.
+That makes ANSI-C quoting a no-go.
+
+Still, future versions of this crate may provide an option to enable ANSI-C quoting, at the cost of
+reduced portability.
+
+### Future possibility: printf
+
+Another option would be to invoke the `printf` command, which is required by POSIX to support octal
+escapes. For example, you could 'escape' the Rust string `"\x01"` into the shell syntax `"$(printf
+'\001')"`. The shell will execute the command `printf` with the first argument being literally a
+backslash followed by three digits; `printf` will output the actual byte with value 1; and the
+shell will substitute that back into the original command.
+
+The problem is that 'escaping' a string into a command substitution just feels too surprising. If
+nothing else, it only works with an actual shell; [other languages' shell parsing
+routines](crate#compatibility) wouldn't understand it. Neither would this crate's own parser,
+though that could be fixed.
+
+Future versions of this crate may provide an option to use `printf` for quoting.
+
+### Special note: newlines
+
+Did you know that `\r` and `\n` are control characters? They aren't as dangerous as other control
+characters (if quoted properly). But there's still an issue with them in interactive contexts.
+
+Namely, in some cases, interactive shells and/or the tty layer will 'helpfully' translate between
+different line ending conventions. The possibilities include replacing `\r` with `\n`, replacing
+`\n` with `\r\n`, and others. This can't result in command injection, but it's still a lossy
+transformation which can result in a failure to round-trip (i.e. the shell sees a different string
+from what was originally passed to `quote`).
+
+Numeric escapes would solve this as well.
+
+# Solved issues
+
+## Solved: Past vulnerability (GHSA-r7qv-8r2h-pg27 / RUSTSEC-2024-0006)
+
+Versions of this crate before 1.3.0 did not quote `{`, `}`, and `\xa0`.
+
+See:
+- <https://github.com/advisories/GHSA-r7qv-8r2h-pg27>
+- <https://rustsec.org/advisories/RUSTSEC-2024-0006.html>
+
+## Solved: `!` and `^`
+
+There are two non-control characters which have a special meaning in interactive contexts only: `!` and
+`^`. Luckily, these can be escaped adequately.
+
+The `!` character triggers [history expansion][he]; the `^` character can trigger a variant of
+history expansion known as [Quick Substitution][qs]. Both of these characters get expanded even
+inside of double-quoted strings!
+
+If we're in a double-quoted string, then we can't just escape these characters with a backslash.
+Only a specific set of characters can be backslash-escaped inside double quotes; the set of
+supported characters depends on the shell, but it often doesn't include `!` and `^`.[^escbs]
+Trying to backslash-escape an unsupported character produces a literal backslash:
+```bash
+$ echo "\!"
+\!
+```
+
+However, these characters don't get expanded in single-quoted strings, so this crate just
+single-quotes them.
+
+But there's a Bash bug where `^` actually does get partially expanded in single-quoted strings:
+```bash
+$ echo '
+> ^a^b
+> '
+
+!!:s^a^b
+```
+
+To work around that, this crate forces `^` to appear right after an opening single quote. For
+example, the string `"^` is quoted into `'"''^'` instead of `'"^'`. This restriction is overkill,
+since `^` is only meaningful right after a newline, but it's a sufficient restriction (after all, a
+`^` character can't be preceded by a newline if it's forced to be preceded by a single quote), and
+for now it simplifies things.
+
+## Solved: `\xa0`
+
+The byte `\xa0` may be treated as a shell word separator, specifically on Bash on macOS when using
+the default UTF-8 locale, only when the input is invalid UTF-8. This crate handles the issue by
+always using quotes for arguments containing this byte.
+
+In fact, this crate always uses quotes for arguments containing any non-ASCII bytes. This may be
+changed in the future, since it's a bit unfriendly to non-English users. But for now it
+minimizes risk, especially considering the large number of different legacy single-byte locales
+someone might hypothetically be running their shell in.
+
+### Demonstration
+
+```bash
+$ echo -e 'ls a\xa0b' | bash
+ls: a: No such file or directory
+ls: b: No such file or directory
+```
+The normal behavior would be to output a single line, e.g.:
+```bash
+$ echo -e 'ls a\xa0b' | bash
+ls: cannot access 'a'$'\240''b': No such file or directory
+```
+(The specific quoting in the error doesn't matter.)
+
+### Cause
+
+Just for fun, here's why this behavior occurs:
+
+Bash decides which bytes serve as word separators based on the libc function [`isblank`][isblank].
+On macOS in UTF-8 locales, this passes for `\xa0`, corresponding to U+00A0 NO-BREAK SPACE.
+
+This is doubly unique compared to the other systems I tested (Linux/glibc, Linux/musl, and
+Windows/MSVC). First, the other systems don't allow bytes in the range [0x80, 0xFF] to pass
+<code>is<i>foo</i></code> functions in UTF-8 locales, even if the corresponding Unicode codepoint
+does pass, as determined by the wide-character equivalent function, <code>isw<i>foo</i></code>.
+Second, the other systems don't treat U+00A0 as blank (even using `iswblank`).
+
+Meanwhile, Bash checks for multi-byte sequences and forbids them from being treated as special
+characters, so the proper UTF-8 encoding of U+00A0, `b"\xc2\xa0"`, is not treated as a word
+separator. Treatment as a word separator only happens for `b"\xa0"` alone, which is illegal UTF-8.
+
+[ansic]: https://www.gnu.org/software/bash/manual/html_node/ANSI_002dC-Quoting.html
+[he]: https://www.gnu.org/software/bash/manual/html_node/History-Interaction.html
+[qs]: https://www.gnu.org/software/bash/manual/html_node/Event-Designators.html
+[isblank]: https://man7.org/linux/man-pages/man3/isblank.3p.html
+[nul]: #nul-bytes
+
+[^choices]: This can lead to tough choices over which
+ characters to escape and which to leave as-is, especially when Unicode gets involved and you
+ have to balance the risk of confusion with the benefit of properly supporting non-English
+ languages.
+ <br>
+ <br>
+ We don't have the luxury of those choices.
+
+[^gr]: For example, backspace (in Unicode lingo, U+0008 BACKSPACE) turns into U+2408 SYMBOL FOR BACKSPACE.
+
+[^bracketing]: It typically disables almost all handling of control characters by the shell proper,
+ but one necessary exception is the end-of-paste sequence itself (which starts with the control
+ character `\x1b`). In addition, paste bracketing does not suppress handling of control
+ characters by the kernel tty layer, such as `\x03` sending SIGINT (which typically clears the
+ currently typed command, making it dangerous in a similar way to `\x01`).
+
+[^escbs]: For example, Dash doesn't remove the backslash from `"\!"` because it simply doesn't know
+ anything about `!` as a special character: it doesn't support history expansion. On the other
+ end of the spectrum, Zsh supports history expansion and does remove the backslash — though only
+ in interactive mode. Bash's behavior is weirder. It supports history expansion, and if you
+ write `"\!"`, the backslash does prevent history expansion from occurring — but it doesn't get
+ removed!
+
+*/
+
+// `use` declarations to make auto links work:
+use ::{quote, join, Shlex, Quoter, QuoteError};
+
+// TODO: add more about copy-paste and human readability.