Upgrade shlex to 1.3.0 am: bce9dd1ed2 (HEAD, master, main, emu-34-2-dev)

Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/shlex/+/2949488
Change-Id: I28dd7465fe47761c8c64bebf1a513117147885df
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
author: Jeff Vander Stoep <jeffv@google.com> 2024-02-06 12:04:30 +0000
committer: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> 2024-02-06 12:04:30 +0000
commit: 52363e2196b542f4f5af7c50f54d709b0ae9a76f (patch)
tree: 4b29caddb5fb6ef3d61025c0e200e7447449c8d0
parent: 35e1d8b055ec3348dae989b958938175ffa067b4 (diff)
parent: bce9dd1ed2cdcf01509b6fbf5302ffaa7714e935 (diff)
download: shlex-emu-34-2-dev.tar.gz
11 files changed, 1262 insertions, 176 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index 02eb82f..efa0c6e 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,5 +1,6 @@
 {
   "git": {
-    "sha1": "8638f145d9356eed9c83e7b2f13c5209e72f0e27"
-  }
-}
+    "sha1": "4a0724b0b62ef715467875b040a890ce75a8a829"
+  },
+  "path_in_vcs": ""
+}
+\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index cd23ebe..d36a04d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,8 @@
-/target/
-Cargo.lock
+nocommit/
+target/
+artifacts/
+corpus/
+/Cargo.lock
 **/*.rs.bk
+.*.sw?
+.sw?
diff --git a/Android.bp b/Android.bp
index ace93db..3ddb79c 100644
--- a/Android.bp
+++ b/Android.bp
@@ -42,7 +42,7 @@ rust_library_host {
     name: "libshlex",
     crate_name: "shlex",
     cargo_env_compat: true,
-    cargo_pkg_version: "1.1.0",
+    cargo_pkg_version: "1.3.0",
     srcs: ["src/lib.rs"],
     edition: "2015",
     features: [
@@ -55,7 +55,7 @@ rust_test_host {
     name: "shlex_test_src_lib",
     crate_name: "shlex",
     cargo_env_compat: true,
-    cargo_pkg_version: "1.1.0",
+    cargo_pkg_version: "1.3.0",
     srcs: ["src/lib.rs"],
     test_suites: ["general-tests"],
     auto_gen_config: true,
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 50d2e6e..95552b4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# 1.2.0
+
+* Adds `bytes` module to support operating directly on byte strings.
+
 # 1.1.0
 
 * Adds the `std` feature (enabled by default)
diff --git a/Cargo.toml b/Cargo.toml
index 2741ed8..2b66892 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,19 +3,30 @@
 # When uploading crates to the registry Cargo will automatically
 # "normalize" Cargo.toml files for maximal compatibility
 # with all versions of Cargo and also rewrite `path` dependencies
-# to registry (e.g., crates.io) dependencies
+# to registry (e.g., crates.io) dependencies.
 #
-# If you believe there's an error in this file please file an
-# issue against the rust-lang/cargo repository. If you're
-# editing this file be aware that the upstream Cargo.toml
-# will likely look very different (and much more reasonable)
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
 
 [package]
+rust-version = "1.46.0"
 name = "shlex"
-version = "1.1.0"
-authors = ["comex <comexk@gmail.com>", "Fenhl <fenhl@fenhl.net>"]
+version = "1.3.0"
+authors = [
+    "comex <comexk@gmail.com>",
+    "Fenhl <fenhl@fenhl.net>",
+    "Adrian Taylor <adetaylor@chromium.org>",
+    "Alex Touchet <alextouchet@outlook.com>",
+    "Daniel Parks <dp+git@oxidized.org>",
+    "Garrett Berg <googberg@gmail.com>",
+]
 description = "Split a string into shell words, like Python's shlex."
-categories = ["command-line-interface", "parser-implementations"]
+readme = "README.md"
+categories = [
+    "command-line-interface",
+    "parser-implementations",
+]
 license = "MIT OR Apache-2.0"
 repository = "https://github.com/comex/rust-shlex"
 
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index 57fb62b..c3644af 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,9 +1,13 @@
 [package]
 name = "shlex"
-version = "1.1.0"
+version = "1.3.0"
 authors = [
     "comex <comexk@gmail.com>",
-    "Fenhl <fenhl@fenhl.net>"
+    "Fenhl <fenhl@fenhl.net>",
+    "Adrian Taylor <adetaylor@chromium.org>",
+    "Alex Touchet <alextouchet@outlook.com>",
+    "Daniel Parks <dp+git@oxidized.org>",
+    "Garrett Berg <googberg@gmail.com>",
 ]
 license = "MIT OR Apache-2.0"
 repository = "https://github.com/comex/rust-shlex"
@@ -12,6 +16,7 @@ categories = [
     "command-line-interface",
     "parser-implementations"
 ]
+rust-version = "1.46.0"
 
 [features]
 std = []
diff --git a/METADATA b/METADATA
index 9ee61c1..b7ec356 100644
--- a/METADATA
+++ b/METADATA
@@ -1,19 +1,20 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update external/rust/crates/shlex
+# For more info, check https://cs.android.com/android/platform/superproject/+/main:tools/external_updater/README.md
+
 name: "shlex"
 description: "Split a string into shell words, like Python\'s shlex."
 third_party {
-  url {
-    type: HOMEPAGE
-    value: "https://crates.io/crates/shlex"
-  }
-  url {
-    type: ARCHIVE
-    value: "https://static.crates.io/crates/shlex/shlex-1.1.0.crate"
-  }
-  version: "1.1.0"
   license_type: NOTICE
   last_upgrade_date {
-    year: 2021
-    month: 9
-    day: 22
+    year: 2024
+    month: 2
+    day: 5
+  }
+  homepage: "https://crates.io/crates/shlex"
+  identifier {
+    type: "Archive"
+    value: "https://static.crates.io/crates/shlex/shlex-1.3.0.crate"
+    version: "1.3.0"
   }
 }
diff --git a/README.md b/README.md
index 6778828..6400a6f 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,11 @@
+[![ci badge]][ci link] [![crates.io badge]][crates.io link] [![docs.rs badge]][docs.rs link]
+
+[crates.io badge]: https://img.shields.io/crates/v/shlex.svg?style=flat-square
+[crates.io link]: https://crates.io/crates/shlex
+[docs.rs badge]: https://img.shields.io/badge/docs-online-dddddd.svg?style=flat-square
+[docs.rs link]: https://docs.rs/shlex
+[ci badge]: https://img.shields.io/github/actions/workflow/status/comex/rust-shlex/test.yml?branch=master&style=flat-square
+[ci link]: https://github.com/comex/rust-shlex/actions
 
 Same idea as (but implementation not directly based on) the Python shlex
 module. However, this implementation does not support any of the Python
@@ -8,8 +16,9 @@ You only get the default settings of shlex.split, which mimic the POSIX shell:
 This implementation also deviates from the Python version in not treating \r
 specially, which I believe is more compliant.
 
-The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate
-over the bytes directly as a micro-optimization.
+This crate can be used on either normal Rust strings, or on byte strings with
+the `bytes` module. The algorithms used are oblivious to UTF-8 high bytes, so
+internally they all work on bytes directly as a micro-optimization.
 
 Disabling the `std` feature (which is enabled by default) will allow the crate
 to work in `no_std` environments, where the `alloc` crate, and a global
diff --git a/src/bytes.rs b/src/bytes.rs
new file mode 100644
index 0000000..af8daad
--- /dev/null
+++ b/src/bytes.rs
@@ -0,0 +1,576 @@
+// Copyright 2015 Nicholas Allegra (comex).
+// Licensed under the Apache License, Version 2.0 <https://www.apache.org/licenses/LICENSE-2.0> or
+// the MIT license <https://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+//! [`Shlex`] and friends for byte strings.
+//!
+//! This is used internally by the [outer module](crate), and may be more
+//! convenient if you are working with byte slices (`[u8]`) or types that are
+//! wrappers around bytes, such as [`OsStr`](std::ffi::OsStr):
+//!
+//! ```rust
+//! #[cfg(unix)] {
+//!     use shlex::bytes::quote;
+//!     use std::ffi::OsStr;
+//!     use std::os::unix::ffi::OsStrExt;
+//!
+//!     // `\x80` is invalid in UTF-8.
+//!     let os_str = OsStr::from_bytes(b"a\x80b c");
+//!     assert_eq!(quote(os_str.as_bytes()), &b"'a\x80b c'"[..]);
+//! }
+//! ```
+//!
+//! (On Windows, `OsStr` uses 16 bit wide characters so this will not work.)
+
+extern crate alloc;
+use alloc::vec::Vec;
+use alloc::borrow::Cow;
+#[cfg(test)]
+use alloc::vec;
+#[cfg(test)]
+use alloc::borrow::ToOwned;
+#[cfg(all(doc, not(doctest)))]
+use crate::{self as shlex, quoting_warning};
+
+use super::QuoteError;
+
+/// An iterator that takes an input byte string and splits it into the words using the same syntax as
+/// the POSIX shell.
+pub struct Shlex<'a> {
+    in_iter: core::slice::Iter<'a, u8>,
+    /// The number of newlines read so far, plus one.
+    pub line_no: usize,
+    /// An input string is erroneous if it ends while inside a quotation or right after an
+    /// unescaped backslash.  Since Iterator does not have a mechanism to return an error, if that
+    /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to
+    /// true; best to check it after you're done iterating.
+    pub had_error: bool,
+}
+
+impl<'a> Shlex<'a> {
+    pub fn new(in_bytes: &'a [u8]) -> Self {
+        Shlex {
+            in_iter: in_bytes.iter(),
+            line_no: 1,
+            had_error: false,
+        }
+    }
+
+    fn parse_word(&mut self, mut ch: u8) -> Option<Vec<u8>> {
+        let mut result: Vec<u8> = Vec::new();
+        loop {
+            match ch as char {
+                '"' => if let Err(()) = self.parse_double(&mut result) {
+                    self.had_error = true;
+                    return None;
+                },
+                '\'' => if let Err(()) = self.parse_single(&mut result) {
+                    self.had_error = true;
+                    return None;
+                },
+                '\\' => if let Some(ch2) = self.next_char() {
+                    if ch2 != '\n' as u8 { result.push(ch2); }
+                } else {
+                    self.had_error = true;
+                    return None;
+                },
+                ' ' | '\t' | '\n' => { break; },
+                _ => { result.push(ch as u8); },
+            }
+            if let Some(ch2) = self.next_char() { ch = ch2; } else { break; }
+        }
+        Some(result)
+    }
+
+    fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
+        loop {
+            if let Some(ch2) = self.next_char() {
+                match ch2 as char {
+                    '\\' => {
+                        if let Some(ch3) = self.next_char() {
+                            match ch3 as char {
+                                // \$ => $
+                                '$' | '`' | '"' | '\\' => { result.push(ch3); },
+                                // \<newline> => nothing
+                                '\n' => {},
+                                // \x => \x
+                                _ => { result.push('\\' as u8); result.push(ch3); }
+                            }
+                        } else {
+                            return Err(());
+                        }
+                    },
+                    '"' => { return Ok(()); },
+                    _ => { result.push(ch2); },
+                }
+            } else {
+                return Err(());
+            }
+        }
+    }
+
+    fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
+        loop {
+            if let Some(ch2) = self.next_char() {
+                match ch2 as char {
+                    '\'' => { return Ok(()); },
+                    _ => { result.push(ch2); },
+                }
+            } else {
+                return Err(());
+            }
+        }
+    }
+
+    fn next_char(&mut self) -> Option<u8> {
+        let res = self.in_iter.next().copied();
+        if res == Some(b'\n') { self.line_no += 1; }
+        res
+    }
+}
+
+impl<'a> Iterator for Shlex<'a> {
+    type Item = Vec<u8>;
+    fn next(&mut self) -> Option<Self::Item> {
+        if let Some(mut ch) = self.next_char() {
+            // skip initial whitespace
+            loop {
+                match ch as char {
+                    ' ' | '\t' | '\n' => {},
+                    '#' => {
+                        while let Some(ch2) = self.next_char() {
+                            if ch2 as char == '\n' { break; }
+                        }
+                    },
+                    _ => { break; }
+                }
+                if let Some(ch2) = self.next_char() { ch = ch2; } else { return None; }
+            }
+            self.parse_word(ch)
+        } else { // no initial character
+            None
+        }
+    }
+
+}
+
+/// Convenience function that consumes the whole byte string at once.  Returns None if the input was
+/// erroneous.
+pub fn split(in_bytes: &[u8]) -> Option<Vec<Vec<u8>>> {
+    let mut shl = Shlex::new(in_bytes);
+    let res = shl.by_ref().collect();
+    if shl.had_error { None } else { Some(res) }
+}
+
+/// A more configurable interface to quote strings.  If you only want the default settings you can
+/// use the convenience functions [`try_quote`] and [`try_join`].
+///
+/// The string equivalent is [`shlex::Quoter`].
+#[derive(Default, Debug, Clone)]
+pub struct Quoter {
+    allow_nul: bool,
+    // TODO: more options
+}
+
+impl Quoter {
+    /// Create a new [`Quoter`] with default settings.
+    #[inline]
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set whether to allow [nul bytes](quoting_warning#nul-bytes).  By default they are not
+    /// allowed and will result in an error of [`QuoteError::Nul`].
+    #[inline]
+    pub fn allow_nul(mut self, allow: bool) -> Self {
+        self.allow_nul = allow;
+        self
+    }
+
+    /// Convenience function that consumes an iterable of words and turns it into a single byte string,
+    /// quoting words when necessary. Consecutive words will be separated by a single space.
+    pub fn join<'a, I: IntoIterator<Item = &'a [u8]>>(&self, words: I) -> Result<Vec<u8>, QuoteError> {
+        Ok(words.into_iter()
+            .map(|word| self.quote(word))
+            .collect::<Result<Vec<Cow<[u8]>>, QuoteError>>()?
+            .join(&b' '))
+    }
+
+    /// Given a single word, return a byte string suitable to encode it as a shell argument.
+    ///
+    /// If given valid UTF-8, this will never produce invalid UTF-8. This is because it only
+    /// ever inserts valid ASCII characters before or after existing ASCII characters (or
+    /// returns two single quotes if the input was an empty string). It will never modify a
+    /// multibyte UTF-8 character.
+    pub fn quote<'a>(&self, mut in_bytes: &'a [u8]) -> Result<Cow<'a, [u8]>, QuoteError> {
+        if in_bytes.is_empty() {
+            // Empty string.  Special case that isn't meaningful as only part of a word.
+            return Ok(b"''"[..].into());
+        }
+        if !self.allow_nul && in_bytes.iter().any(|&b| b == b'\0') {
+            return Err(QuoteError::Nul);
+        }
+        let mut out: Vec<u8> = Vec::new();
+        while !in_bytes.is_empty() {
+            // Pick a quoting strategy for some prefix of the input.  Normally this will cover the
+            // entire input, but in some cases we might need to divide the input into multiple chunks
+            // that are quoted differently.
+            let (cur_len, strategy) = quoting_strategy(in_bytes);
+            if cur_len == in_bytes.len() && strategy == QuotingStrategy::Unquoted && out.is_empty() {
+                // Entire string can be represented unquoted.  Reuse the allocation.
+                return Ok(in_bytes.into());
+            }
+            let (cur_chunk, rest) = in_bytes.split_at(cur_len);
+            assert!(rest.len() < in_bytes.len()); // no infinite loop
+            in_bytes = rest;
+            append_quoted_chunk(&mut out, cur_chunk, strategy);
+        }
+        Ok(out.into())
+    }
+
+}
+
+#[derive(PartialEq)]
+enum QuotingStrategy {
+    /// No quotes and no backslash escapes.  (If backslash escapes would be necessary, we use a
+    /// different strategy instead.)
+    Unquoted,
+    /// Single quoted.
+    SingleQuoted,
+    /// Double quotes, potentially with backslash escapes.
+    DoubleQuoted,
+    // TODO: add $'xxx' and "$(printf 'xxx')" styles
+}
+
+/// Is this ASCII byte okay to emit unquoted?
+const fn unquoted_ok(c: u8) -> bool {
+    match c as char {
+        // Allowed characters:
+        '+' | '-' | '.' | '/' | ':' | '@' | ']' | '_' |
+        '0'..='9' | 'A'..='Z' | 'a'..='z'
+        => true,
+
+        // Non-allowed characters:
+        // From POSIX https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html
+        // "The application shall quote the following characters if they are to represent themselves:"
+        '|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | '\'' | ' ' | '\t' | '\n' |
+        // "and the following may need to be quoted under certain circumstances[..]:"
+        '*' | '?' | '[' | '#' | '~' | '=' | '%' |
+        // Brace expansion.  These ought to be in the POSIX list but aren't yet;
+        // see: https://www.austingroupbugs.net/view.php?id=1193
+        '{' | '}' |
+        // Also quote comma, just to be safe in the extremely odd case that the user of this crate
+        // is intentionally placing a quoted string inside a brace expansion, e.g.:
+        //     format!("echo foo{{a,b,{}}}", shlex::quote(some_str))
+        ',' |
+        // '\r' is allowed in a word by all real shells I tested, but is treated as a word
+        // separator by Python `shlex`, and might be translated to '\n' in interactive mode.
+        '\r' |
+        // '!' and '^' are treated specially in interactive mode; see quoting_warning.
+        '!' | '^' |
+        // Nul bytes and control characters.
+        '\x00' ..= '\x1f' | '\x7f'
+        => false,
+        '\u{80}' ..= '\u{10ffff}' => {
+            // This is unreachable since `unquoted_ok` is only called for 0..128.
+            // Non-ASCII bytes are handled separately in `quoting_strategy`.
+            // Can't call unreachable!() from `const fn` on old Rust, so...
+            unquoted_ok(c)
+        },
+    }
+    // Note: The logic cited above for quoting comma might suggest that `..` should also be quoted
+    // (it is a special case of brace expansion).  But it's not necessary.  There are three cases:
+    //
+    // 1. The user wants comma-based brace expansion, but the untrusted string being `quote`d
+    //    contains `..`, so they get something like `{foo,bar,3..5}`.
+    //  => That's safe; both Bash and Zsh expand this to `foo bar 3..5` rather than
+    //     `foo bar 3 4 5`.  The presence of commas disables sequence expression expansion.
+    //
+    // 2. The user wants comma-based brace expansion where the contents of the braces are a
+    //    variable number of `quote`d strings and nothing else.  There happens to be exactly
+    //    one string and it contains `..`, so they get something like `{3..5}`.
+    //  => Then this will expand as a sequence expression, which is unintended.  But I don't mind,
+    //     because any such code is already buggy.  Suppose the untrusted string *didn't* contain
+    //     `,` or `..`, resulting in shell input like `{foo}`.  Then the shell would interpret it
+    //     as the literal string `{foo}` rather than brace-expanding it into `foo`.
+    //
+    // 3. The user wants a sequence expression and wants to supply an untrusted string as one of
+    //    the endpoints or the increment.
+    //  => Well, that's just silly, since the endpoints can only be numbers or single letters.
+}
+
+/// Optimized version of `unquoted_ok`.
+fn unquoted_ok_fast(c: u8) -> bool {
+    const UNQUOTED_OK_MASK: u128 = {
+        // Make a mask of all bytes in 0..<0x80 that pass.
+        let mut c = 0u8;
+        let mut mask = 0u128;
+        while c < 0x80 {
+            if unquoted_ok(c) {
+                mask |= 1u128 << c;
+            }
+            c += 1;
+        }
+        mask
+    };
+    ((UNQUOTED_OK_MASK >> c) & 1) != 0
+}
+
+/// Is this ASCII byte okay to emit in single quotes?
+fn single_quoted_ok(c: u8) -> bool {
+    match c {
+        // No single quotes in single quotes.
+        b'\'' => false,
+        // To work around a Bash bug, ^ is only allowed right after an opening single quote; see
+        // quoting_warning.
+        b'^' => false,
+        // Backslashes in single quotes are literal according to POSIX, but Fish treats them as an
+        // escape character.  Ban them.  Fish doesn't aim to be POSIX-compatible, but we *can*
+        // achieve Fish compatibility using double quotes, so we might as well.
+        b'\\' => false,
+        _ => true
+    }
+}
+
+/// Is this ASCII byte okay to emit in double quotes?
+fn double_quoted_ok(c: u8) -> bool {
+    match c {
+        // Work around Python `shlex` bug where parsing "\`" and "\$" doesn't strip the
+        // backslash, even though POSIX requires it.
+        b'`' | b'$' => false,
+        // '!' and '^' are treated specially in interactive mode; see quoting_warning.
+        b'!' | b'^' => false,
+        _ => true
+    }
+}
+
+/// Given an input, return a quoting strategy that can cover some prefix of the string, along with
+/// the size of that prefix.
+///
+/// Precondition: input size is nonzero.  (Empty strings are handled by the caller.)
+/// Postcondition: returned size is nonzero.
+#[cfg_attr(manual_codegen_check, inline(never))]
+fn quoting_strategy(in_bytes: &[u8]) -> (usize, QuotingStrategy) {
+    const UNQUOTED_OK: u8 = 1;
+    const SINGLE_QUOTED_OK: u8 = 2;
+    const DOUBLE_QUOTED_OK: u8 = 4;
+
+    let mut prev_ok = SINGLE_QUOTED_OK | DOUBLE_QUOTED_OK | UNQUOTED_OK;
+    let mut i = 0;
+
+    if in_bytes[0] == b'^' {
+        // To work around a Bash bug, ^ is only allowed right after an opening single quote; see
+        // quoting_warning.
+        prev_ok = SINGLE_QUOTED_OK;
+        i = 1;
+    }
+
+    while i < in_bytes.len() {
+        let c = in_bytes[i];
+        let mut cur_ok = prev_ok;
+
+        if c >= 0x80 {
+            // Normally, non-ASCII characters shouldn't require quoting, but see quoting_warning.md
+            // about \xa0.  For now, just treat all non-ASCII characters as requiring quotes.  This
+            // also ensures things are safe in the off-chance that you're in a legacy 8-bit locale that
+            // has additional characters satisfying `isblank`.
+            cur_ok &= !UNQUOTED_OK;
+        } else {
+            if !unquoted_ok_fast(c) {
+                cur_ok &= !UNQUOTED_OK;
+            }
+            if !single_quoted_ok(c){
+                cur_ok &= !SINGLE_QUOTED_OK;
+            }
+            if !double_quoted_ok(c) {
+                cur_ok &= !DOUBLE_QUOTED_OK;
+            }
+        }
+
+        if cur_ok == 0 {
+            // There are no quoting strategies that would work for both the previous characters and
+            // this one.  So we have to end the chunk before this character.  The caller will call
+            // `quoting_strategy` again to handle the rest of the string.
+            break;
+        }
+
+        prev_ok = cur_ok;
+        i += 1;
+    }
+
+    // Pick the best allowed strategy.
+    let strategy = if prev_ok & UNQUOTED_OK != 0 {
+        QuotingStrategy::Unquoted
+    } else if prev_ok & SINGLE_QUOTED_OK != 0 {
+        QuotingStrategy::SingleQuoted
+    } else if prev_ok & DOUBLE_QUOTED_OK != 0 {
+        QuotingStrategy::DoubleQuoted
+    } else {
+        unreachable!()
+    };
+    debug_assert!(i > 0);
+    (i, strategy)
+}
+
+fn append_quoted_chunk(out: &mut Vec<u8>, cur_chunk: &[u8], strategy: QuotingStrategy) {
+    match strategy {
+        QuotingStrategy::Unquoted => {
+            out.extend_from_slice(cur_chunk);
+        },
+        QuotingStrategy::SingleQuoted => {
+            out.reserve(cur_chunk.len() + 2);
+            out.push(b'\'');
+            out.extend_from_slice(cur_chunk);
+            out.push(b'\'');
+        },
+        QuotingStrategy::DoubleQuoted => {
+            out.reserve(cur_chunk.len() + 2);
+            out.push(b'"');
+            for &c in cur_chunk.into_iter() {
+                if let b'$' | b'`' | b'"' | b'\\' = c {
+                    // Add a preceding backslash.
+                    // Note: We shouldn't actually get here for $ and ` because they don't pass
+                    // `double_quoted_ok`.
+                    out.push(b'\\');
+                }
+                // Add the character itself.
+                out.push(c);
+            }
+            out.push(b'"');
+        },
+    }
+}
+
+/// Convenience function that consumes an iterable of words and turns it into a single byte string,
+/// quoting words when necessary. Consecutive words will be separated by a single space.
+///
+/// Uses default settings except that nul bytes are passed through, which [may be
+/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated.
+///
+/// Equivalent to [`Quoter::new().allow_nul(true).join(words).unwrap()`](Quoter).
+///
+/// (That configuration never returns `Err`, so this function does not panic.)
+///
+/// The string equivalent is [shlex::join].
+#[deprecated(since = "1.3.0", note = "replace with `try_join(words)?` to avoid nul byte danger")]
+pub fn join<'a, I: IntoIterator<Item = &'a [u8]>>(words: I) -> Vec<u8> {
+    Quoter::new().allow_nul(true).join(words).unwrap()
+}
+
+/// Convenience function that consumes an iterable of words and turns it into a single byte string,
+/// quoting words when necessary. Consecutive words will be separated by a single space.
+///
+/// Uses default settings.  The only error that can be returned is [`QuoteError::Nul`].
+///
+/// Equivalent to [`Quoter::new().join(words)`](Quoter).
+///
+/// The string equivalent is [shlex::try_join].
+pub fn try_join<'a, I: IntoIterator<Item = &'a [u8]>>(words: I) -> Result<Vec<u8>, QuoteError> {
+    Quoter::new().join(words)
+}
+
+/// Given a single word, return a byte string suitable to encode it as a shell argument.
+///
+/// Uses default settings except that nul bytes are passed through, which [may be
+/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated.
+///
+/// Equivalent to [`Quoter::new().allow_nul(true).quote(in_bytes).unwrap()`](Quoter).
+///
+/// (That configuration never returns `Err`, so this function does not panic.)
+///
+/// The string equivalent is [shlex::quote].
+#[deprecated(since = "1.3.0", note = "replace with `try_quote(str)?` to avoid nul byte danger")]
+pub fn quote(in_bytes: &[u8]) -> Cow<[u8]> {
+    Quoter::new().allow_nul(true).quote(in_bytes).unwrap()
+}
+
+/// Given a single word, return a byte string suitable to encode it as a shell argument.
+///
+/// Uses default settings.  The only error that can be returned is [`QuoteError::Nul`].
+///
+/// Equivalent to [`Quoter::new().quote(in_bytes)`](Quoter).
+///
+/// (Unlike [`quote`], nul bytes are reported as an error instead of being passed through.)
+///
+/// The string equivalent is [shlex::try_quote].
+pub fn try_quote(in_bytes: &[u8]) -> Result<Cow<[u8]>, QuoteError> {
+    Quoter::new().quote(in_bytes)
+}
+
+#[cfg(test)]
+const INVALID_UTF8: &[u8] = b"\xa1";
+#[cfg(test)]
+const INVALID_UTF8_SINGLEQUOTED: &[u8] = b"'\xa1'";
+
+#[test]
+#[allow(invalid_from_utf8)]
+fn test_invalid_utf8() {
+    // Check that our test string is actually invalid UTF-8.
+    assert!(core::str::from_utf8(INVALID_UTF8).is_err());
+}
+
+#[cfg(test)]
+static SPLIT_TEST_ITEMS: &'static [(&'static [u8], Option<&'static [&'static [u8]]>)] = &[
+    (b"foo$baz", Some(&[b"foo$baz"])),
+    (b"foo baz", Some(&[b"foo", b"baz"])),
+    (b"foo\"bar\"baz", Some(&[b"foobarbaz"])),
+    (b"foo \"bar\"baz", Some(&[b"foo", b"barbaz"])),
+    (b"   foo \nbar", Some(&[b"foo", b"bar"])),
+    (b"foo\\\nbar", Some(&[b"foobar"])),
+    (b"\"foo\\\nbar\"", Some(&[b"foobar"])),
+    (b"'baz\\$b'", Some(&[b"baz\\$b"])),
+    (b"'baz\\\''", None),
+    (b"\\", None),
+    (b"\"\\", None),
+    (b"'\\", None),
+    (b"\"", None),
+    (b"'", None),
+    (b"foo #bar\nbaz", Some(&[b"foo", b"baz"])),
+    (b"foo #bar", Some(&[b"foo"])),
+    (b"foo#bar", Some(&[b"foo#bar"])),
+    (b"foo\"#bar", None),
+    (b"'\\n'", Some(&[b"\\n"])),
+    (b"'\\\\n'", Some(&[b"\\\\n"])),
+    (INVALID_UTF8, Some(&[INVALID_UTF8])),
+];
+
+#[test]
+fn test_split() {
+    for &(input, output) in SPLIT_TEST_ITEMS {
+        assert_eq!(split(input), output.map(|o| o.iter().map(|&x| x.to_owned()).collect()));
+    }
+}
+
+#[test]
+fn test_lineno() {
+    let mut sh = Shlex::new(b"\nfoo\nbar");
+    while let Some(word) = sh.next() {
+        if word == b"bar" {
+            assert_eq!(sh.line_no, 3);
+        }
+    }
+}
+
+#[test]
+#[allow(deprecated)]
+fn test_quote() {
+    // Validate behavior with invalid UTF-8:
+    assert_eq!(quote(INVALID_UTF8), INVALID_UTF8_SINGLEQUOTED);
+    // Replicate a few tests from lib.rs.  No need to replicate all of them.
+    assert_eq!(quote(b""), &b"''"[..]);
+    assert_eq!(quote(b"foobar"), &b"foobar"[..]);
+    assert_eq!(quote(b"foo bar"), &b"'foo bar'"[..]);
+    assert_eq!(quote(b"'\""), &b"\"'\\\"\""[..]);
+    assert_eq!(quote(b""), &b"''"[..]);
+}
+
+#[test]
+#[allow(deprecated)]
+fn test_join() {
+    // Validate behavior with invalid UTF-8:
+    assert_eq!(join(vec![INVALID_UTF8]), INVALID_UTF8_SINGLEQUOTED);
+    // Replicate a few tests from lib.rs.  No need to replicate all of them.
+    assert_eq!(join(vec![]), &b""[..]);
+    assert_eq!(join(vec![&b""[..]]), b"''");
+}
diff --git a/src/lib.rs b/src/lib.rs
index 31b54bd..aa5c306 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,20 +3,37 @@
 // the MIT license <https://opensource.org/licenses/MIT>, at your option. This file may not be
 // copied, modified, or distributed except according to those terms.
 
-//! Same idea as (but implementation not directly based on) the Python shlex module.  However, this
-//! implementation does not support any of the Python module's customization because it makes
-//! parsing slower and is fairly useless.  You only get the default settings of shlex.split, which
-//! mimic the POSIX shell:
-//! <https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html>
+//! Parse strings like, and escape strings for, POSIX shells.
 //!
-//! This implementation also deviates from the Python version in not treating `\r` specially, which
-//! I believe is more compliant.
-//!
-//! The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate over the bytes
-//! directly as a micro-optimization.
+//! Same idea as (but implementation not directly based on) the Python shlex module.
 //!
 //! Disabling the `std` feature (which is enabled by default) will allow the crate to work in
 //! `no_std` environments, where the `alloc` crate, and a global allocator, are available.
+//!
+//! ## <span style="color:red">Warning</span>
+//!
+//! The [`try_quote`]/[`try_join`] family of APIs does not quote control characters (because they
+//! cannot be quoted portably).
+//!
+//! This is fully safe in noninteractive contexts, like shell scripts and `sh -c` arguments (or
+//! even scripts `source`d from interactive shells).
+//!
+//! But if you are quoting for human consumption, you should keep in mind that ugly inputs produce
+//! ugly outputs (which may not be copy-pastable).
+//!
+//! And if by chance you are piping the output of [`try_quote`]/[`try_join`] directly to the stdin
+//! of an interactive shell, you should stop, because control characters can lead to arbitrary
+//! command injection.
+//!
+//! For more information, and for information about more minor issues, please see [quoting_warning].
+//!
+//! ## Compatibility
+//!
+//! This crate's quoting functionality tries to be compatible with **any POSIX-compatible shell**;
+//! it's tested against `bash`, `zsh`, `dash`, Busybox `ash`, and `mksh`, plus `fish` (which is not
+//! POSIX-compatible but close enough).
+//!
+//! It also aims to be compatible with Python `shlex` and C `wordexp`.
 
 #![cfg_attr(not(feature = "std"), no_std)]
 
@@ -29,124 +46,45 @@ use alloc::vec;
 #[cfg(test)]
 use alloc::borrow::ToOwned;
 
+pub mod bytes;
+#[cfg(all(doc, not(doctest)))]
+#[path = "quoting_warning.md"]
+pub mod quoting_warning;
+
 /// An iterator that takes an input string and splits it into the words using the same syntax as
 /// the POSIX shell.
-pub struct Shlex<'a> {
-    in_iter: core::str::Bytes<'a>,
-    /// The number of newlines read so far, plus one.
-    pub line_no: usize,
-    /// An input string is erroneous if it ends while inside a quotation or right after an
-    /// unescaped backslash.  Since Iterator does not have a mechanism to return an error, if that
-    /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to
-    /// true; best to check it after you're done iterating.
-    pub had_error: bool,
-}
+///
+/// See [`bytes::Shlex`].
+pub struct Shlex<'a>(bytes::Shlex<'a>);
 
 impl<'a> Shlex<'a> {
     pub fn new(in_str: &'a str) -> Self {
-        Shlex {
-            in_iter: in_str.bytes(),
-            line_no: 1,
-            had_error: false,
-        }
-    }
-
-    fn parse_word(&mut self, mut ch: u8) -> Option<String> {
-        let mut result: Vec<u8> = Vec::new();
-        loop {
-            match ch as char {
-                '"' => if let Err(()) = self.parse_double(&mut result) {
-                    self.had_error = true;
-                    return None;
-                },
-                '\'' => if let Err(()) = self.parse_single(&mut result) {
-                    self.had_error = true;
-                    return None;
-                },
-                '\\' => if let Some(ch2) = self.next_char() {
-                    if ch2 != '\n' as u8 { result.push(ch2); }
-                } else {
-                    self.had_error = true;
-                    return None;
-                },
-                ' ' | '\t' | '\n' => { break; },
-                _ => { result.push(ch as u8); },
-            }
-            if let Some(ch2) = self.next_char() { ch = ch2; } else { break; }
-        }
-        unsafe { Some(String::from_utf8_unchecked(result)) }
+        Self(bytes::Shlex::new(in_str.as_bytes()))
     }
+}
 
-    fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
-        loop {
-            if let Some(ch2) = self.next_char() {
-                match ch2 as char {
-                    '\\' => {
-                        if let Some(ch3) = self.next_char() {
-                            match ch3 as char {
-                                // \$ => $
-                                '$' | '`' | '"' | '\\' => { result.push(ch3); },
-                                // \<newline> => nothing
-                                '\n' => {},
-                                // \x => =x
-                                _ => { result.push('\\' as u8); result.push(ch3); }
-                            }
-                        } else {
-                            return Err(());
-                        }
-                    },
-                    '"' => { return Ok(()); },
-                    _ => { result.push(ch2); },
-                }
-            } else {
-                return Err(());
-            }
-        }
+impl<'a> Iterator for Shlex<'a> {
+    type Item = String;
+    fn next(&mut self) -> Option<String> {
+        self.0.next().map(|byte_word| {
+            // Safety: given valid UTF-8, bytes::Shlex will always return valid UTF-8.
+            unsafe { String::from_utf8_unchecked(byte_word) }
+        })
     }
+}
 
-    fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
-        loop {
-            if let Some(ch2) = self.next_char() {
-                match ch2 as char {
-                    '\'' => { return Ok(()); },
-                    _ => { result.push(ch2); },
-                }
-            } else {
-                return Err(());
-            }
-        }
-    }
+impl<'a> core::ops::Deref for Shlex<'a> {
+    type Target = bytes::Shlex<'a>;
 
-    fn next_char(&mut self) -> Option<u8> {
-        let res = self.in_iter.next();
-        if res == Some('\n' as u8) { self.line_no += 1; }
-        res
+    fn deref(&self) -> &Self::Target {
+        &self.0
     }
 }
 
-impl<'a> Iterator for Shlex<'a> {
-    type Item = String;
-    fn next(&mut self) -> Option<String> {
-        if let Some(mut ch) = self.next_char() {
-            // skip initial whitespace
-            loop {
-                match ch as char {
-                    ' ' | '\t' | '\n' => {},
-                    '#' => {
-                        while let Some(ch2) = self.next_char() {
-                            if ch2 as char == '\n' { break; }
-                        }
-                    },
-                    _ => { break; }
-                }
-                if let Some(ch2) = self.next_char() { ch = ch2; } else { return None; }
-            }
-            self.parse_word(ch)
-        } else { // no initial character
-            None
-        }
+impl<'a> core::ops::DerefMut for Shlex<'a> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
     }
-
 }
 
 /// Convenience function that consumes the whole string at once.  Returns None if the input was
@@ -157,38 +95,151 @@ pub fn split(in_str: &str) -> Option<Vec<String>> {
     if shl.had_error { None } else { Some(res) }
 }
 
-/// Given a single word, return a string suitable to encode it as a shell argument.
-pub fn quote(in_str: &str) -> Cow<str> {
-    if in_str.len() == 0 {
-        "\"\"".into()
-    } else if in_str.bytes().any(|c| match c as char {
-        '|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | '\'' | ' ' | '\t' |
-        '\r' | '\n' | '*' | '?' | '[' | '#' | '~' | '=' | '%' => true,
-        _ => false
-    }) {
-        let mut out: Vec<u8> = Vec::new();
-        out.push('"' as u8);
-        for c in in_str.bytes() {
-            match c as char {
-                '$' | '`' | '"' | '\\' => out.push('\\' as u8),
-                _ => ()
-            }
-            out.push(c);
+/// Errors from [`Quoter::quote`], [`Quoter::join`], etc. (and their [`bytes`] counterparts).
+///
+/// By default, the only error that can be returned is [`QuoteError::Nul`].  If you call
+/// `allow_nul(true)`, then no errors can be returned at all.  Any error variants added in the
+/// future will not be enabled by default; they will be enabled through corresponding non-default
+/// [`Quoter`] options.
+///
+/// ...In theory.  In the unlikely event that additional classes of inputs are discovered that,
+/// like nul bytes, are fundamentally unsafe to quote even for non-interactive shells, the risk
+/// will be mitigated by adding corresponding [`QuoteError`] variants that *are* enabled by
+/// default.
+#[non_exhaustive]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum QuoteError {
+    /// The input contained a nul byte.  In most cases, shells fundamentally [cannot handle strings
+    /// containing nul bytes](quoting_warning#nul-bytes), no matter how they are quoted.  But if
+    /// you're sure you can handle nul bytes, you can call `allow_nul(true)` on the `Quoter` to let
+    /// them pass through.
+    Nul,
+}
+
+impl core::fmt::Display for QuoteError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        match self {
+            QuoteError::Nul => f.write_str("cannot shell-quote string containing nul byte"),
         }
-        out.push('"' as u8);
-        unsafe { String::from_utf8_unchecked(out) }.into()
-    } else {
-        in_str.into()
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for QuoteError {}
+
+/// A more configurable interface to quote strings.  If you only want the default settings you can
+/// use the convenience functions [`try_quote`] and [`try_join`].
+///
+/// The bytes equivalent is [`bytes::Quoter`].
+#[derive(Default, Debug, Clone)]
+pub struct Quoter {
+    inner: bytes::Quoter,
+}
+
+impl Quoter {
+    /// Create a new [`Quoter`] with default settings.
+    #[inline]
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set whether to allow [nul bytes](quoting_warning#nul-bytes).  By default they are not
+    /// allowed and will result in an error of [`QuoteError::Nul`].
+    #[inline]
+    pub fn allow_nul(mut self, allow: bool) -> Self {
+        self.inner = self.inner.allow_nul(allow);
+        self
+    }
+
+    /// Convenience function that consumes an iterable of words and turns it into a single string,
+    /// quoting words when necessary. Consecutive words will be separated by a single space.
+    pub fn join<'a, I: IntoIterator<Item = &'a str>>(&self, words: I) -> Result<String, QuoteError> {
+        // Safety: given valid UTF-8, bytes::join() will always return valid UTF-8.
+        self.inner.join(words.into_iter().map(|s| s.as_bytes()))
+            .map(|bytes| unsafe { String::from_utf8_unchecked(bytes) })
+    }
+
+    /// Given a single word, return a string suitable to encode it as a shell argument.
+    pub fn quote<'a>(&self, in_str: &'a str) -> Result<Cow<'a, str>, QuoteError> {
+        Ok(match self.inner.quote(in_str.as_bytes())? {
+            Cow::Borrowed(out) => {
+                // Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8.
+                unsafe { core::str::from_utf8_unchecked(out) }.into()
+            }
+            Cow::Owned(out) => {
+                // Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8.
+                unsafe { String::from_utf8_unchecked(out) }.into()
+            }
+        })
+    }
+}
+
+impl From<bytes::Quoter> for Quoter {
+    fn from(inner: bytes::Quoter) -> Quoter {
+        Quoter { inner }
+    }
+}
+
+impl From<Quoter> for bytes::Quoter {
+    fn from(quoter: Quoter) -> bytes::Quoter {
+        quoter.inner
     }
 }
 
 /// Convenience function that consumes an iterable of words and turns it into a single string,
 /// quoting words when necessary. Consecutive words will be separated by a single space.
+///
+/// Uses default settings except that nul bytes are passed through, which [may be
+/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated.
+///
+/// Equivalent to [`Quoter::new().allow_nul(true).join(words).unwrap()`](Quoter).
+///
+/// (That configuration never returns `Err`, so this function does not panic.)
+///
+/// The bytes equivalent is [bytes::join].
+#[deprecated(since = "1.3.0", note = "replace with `try_join(words)?` to avoid nul byte danger")]
 pub fn join<'a, I: IntoIterator<Item = &'a str>>(words: I) -> String {
-    words.into_iter()
-        .map(quote)
-        .collect::<Vec<_>>()
-        .join(" ")
+    Quoter::new().allow_nul(true).join(words).unwrap()
+}
+
+/// Convenience function that consumes an iterable of words and turns it into a single string,
+/// quoting words when necessary. Consecutive words will be separated by a single space.
+///
+/// Uses default settings.  The only error that can be returned is [`QuoteError::Nul`].
+///
+/// Equivalent to [`Quoter::new().join(words)`](Quoter).
+///
+/// The bytes equivalent is [bytes::try_join].
+pub fn try_join<'a, I: IntoIterator<Item = &'a str>>(words: I) -> Result<String, QuoteError> {
+    Quoter::new().join(words)
+}
+
+/// Given a single word, return a string suitable to encode it as a shell argument.
+///
+/// Uses default settings except that nul bytes are passed through, which [may be
+/// dangerous](quoting_warning#nul-bytes), leading to this function being deprecated.
+///
+/// Equivalent to [`Quoter::new().allow_nul(true).quote(in_str).unwrap()`](Quoter).
+///
+/// (That configuration never returns `Err`, so this function does not panic.)
+///
+/// The bytes equivalent is [bytes::quote].
+#[deprecated(since = "1.3.0", note = "replace with `try_quote(str)?` to avoid nul byte danger")]
+pub fn quote(in_str: &str) -> Cow<str> {
+    Quoter::new().allow_nul(true).quote(in_str).unwrap()
+}
+
+/// Given a single word, return a string suitable to encode it as a shell argument.
+///
+/// Uses default settings.  The only error that can be returned is [`QuoteError::Nul`].
+///
+/// Equivalent to [`Quoter::new().quote(in_str)`](Quoter).
+///
+/// (Unlike the deprecated [`quote`], nul bytes are not passed through; they produce `Err`.)
+///
+/// The bytes equivalent is [bytes::try_quote].
+pub fn try_quote(in_str: &str) -> Result<Cow<str>, QuoteError> {
+    Quoter::new().quote(in_str)
 }
 
 #[cfg(test)]
@@ -233,17 +284,75 @@ fn test_lineno() {
 }
 
 #[test]
+#[cfg_attr(not(feature = "std"), allow(unreachable_code, unused_mut))]
 fn test_quote() {
-    assert_eq!(quote("foobar"), "foobar");
-    assert_eq!(quote("foo bar"), "\"foo bar\"");
-    assert_eq!(quote("\""), "\"\\\"\"");
-    assert_eq!(quote(""), "\"\"");
+    // This is a list of (unquoted, quoted) pairs.
+    // But it's using a single long (raw) string literal with an ad-hoc format, just because it's
+    // hard to read if we have to put the test strings through Rust escaping on top of the escaping
+    // being tested.  (Even raw string literals are noisy for short strings).
+    // Ad-hoc: "NL" is replaced with a literal newline; no other escape sequences.
+    let tests = r#"
+        <>                => <''>
+        <foobar>          => <foobar>
+        <foo bar>         => <'foo bar'>
+        <"foo bar'">      => <"\"foo bar'\"">
+        <'foo bar'>       => <"'foo bar'">
+        <">               => <'"'>
+        <"'>              => <"\"'">
+        <hello!world>     => <'hello!world'>
+        <'hello!world>    => <"'hello"'!world'>
+        <'hello!>         => <"'hello"'!'>
+        <hello ^ world>   => <'hello ''^ world'>
+        <hello^>          => <hello'^'>
+        <!world'>         => <'!world'"'">
+        <{a, b}>          => <'{a, b}'>
+        <NL>              => <'NL'>
+        <^>               => <'^'>
+        <foo^bar>         => <foo'^bar'>
+        <NLx^>            => <'NLx''^'>
+        <NL^x>            => <'NL''^x'>
+        <NL ^x>           => <'NL ''^x'>
+        <{a,b}>           => <'{a,b}'>
+        <a,b>             => <'a,b'>
+        <a..b>            => <a..b>
+        <'$>              => <"'"'$'>
+        <"^>              => <'"''^'>
+    "#;
+    let mut ok = true;
+    for test in tests.trim().split('\n') {
+        let parts: Vec<String> = test
+            .replace("NL", "\n")
+            .split("=>")
+            .map(|part| part.trim().trim_start_matches('<').trim_end_matches('>').to_owned())
+            .collect();
+        assert!(parts.len() == 2);
+        let unquoted = &*parts[0];
+        let quoted_expected = &*parts[1];
+        let quoted_actual = try_quote(&parts[0]).unwrap();
+        if quoted_expected != quoted_actual {
+            #[cfg(not(feature = "std"))]
+            panic!("FAIL: for input <{}>, expected <{}>, got <{}>",
+                     unquoted, quoted_expected, quoted_actual);
+            #[cfg(feature = "std")]
+            println!("FAIL: for input <{}>, expected <{}>, got <{}>",
+                     unquoted, quoted_expected, quoted_actual);
+            ok = false;
+        }
+    }
+    assert!(ok);
 }
 
 #[test]
+#[allow(deprecated)]
 fn test_join() {
     assert_eq!(join(vec![]), "");
-    assert_eq!(join(vec![""]), "\"\"");
+    assert_eq!(join(vec![""]), "''");
     assert_eq!(join(vec!["a", "b"]), "a b");
-    assert_eq!(join(vec!["foo bar", "baz"]), "\"foo bar\" baz");
+    assert_eq!(join(vec!["foo bar", "baz"]), "'foo bar' baz");
+}
+
+#[test]
+fn test_fallible() {
+    assert_eq!(try_join(vec!["\0"]), Err(QuoteError::Nul));
+    assert_eq!(try_quote("\0"), Err(QuoteError::Nul));
 }
diff --git a/src/quoting_warning.md b/src/quoting_warning.md
new file mode 100644
index 0000000..fab9857
--- /dev/null
+++ b/src/quoting_warning.md
@@ -0,0 +1,365 @@
+// vim: textwidth=99
+/*
+Meta note: This file is loaded as a .rs file by rustdoc only.
+*/
+/*!
+
+A more detailed version of the [warning at the top level](super#warning) about the `quote`/`join`
+family of APIs.
+
+In general, passing the output of these APIs to a shell should recover the original string(s).
+This page lists cases where it fails to do so.
+
+In noninteractive contexts, there are only minor issues.  'Noninteractive' includes shell scripts
+and `sh -c` arguments, or even scripts `source`d from interactive shells.  The issues are:
+
+- [Nul bytes](#nul-bytes)
+
+- [Overlong commands](#overlong-commands)
+
+If you are writing directly to the stdin of an interactive (`-i`) shell (i.e., if you are
+pretending to be a terminal), or if you are writing to a cooked-mode pty (even if the other end is
+noninteractive), then there is a **severe** security issue:
+
+- [Control characters](#control-characters-interactive-contexts-only)
+
+Finally, there are some [solved issues](#solved-issues).
+
+# List of issues
+
+## Nul bytes
+
+For non-interactive shells, the most problematic input is nul bytes (bytes with value 0).  The
+non-deprecated functions all default to returning [`QuoteError::Nul`] when encountering them, but
+the deprecated [`quote`] and [`join`] functions leave them as-is.
+
+In Unix, nul bytes can't appear in command arguments, environment variables, or filenames.  It's
+not a question of proper quoting; they just can't be used at all.  This is a consequence of Unix's
+system calls all being designed around nul-terminated C strings.
+
+Shells inherit that limitation.  Most of them do not accept nul bytes in strings even internally.
+Even when they do, it's pretty much useless or even dangerous, since you can't pass them to
+external commands.
+
+In some cases, you might fail to pass the nul byte to the shell in the first place.  For example,
+the following code uses [`join`] to tunnel a command over an SSH connection:
+
+```rust
+std::process::Command::new("ssh")
+    .arg("myhost")
+    .arg("--")
+    .arg(join(my_cmd_args))
+```
+
+If any argument in `my_cmd_args` contains a nul byte, then `join(my_cmd_args)` will contain a nul
+byte.  But `join(my_cmd_args)` is itself being passed as an argument to a command (the ssh
+command), and command arguments can't contain nul bytes!  So this will simply result in the
+`Command` failing to launch.
+
+Still, there are other ways to smuggle nul bytes into a shell.  How the shell reacts depends on the
+shell and the method of smuggling.  For example, here is Bash 5.2.21 exhibiting three different
+behaviors:
+
+- With ANSI-C quoting, the string is truncated at the first nul byte:
+  ```bash
+  $ echo $'foo\0bar' | hexdump -C
+  00000000  66 6f 6f 0a                                       |foo.|
+  ```
+
+- With command substitution, nul bytes are removed with a warning:
+  ```bash
+  $ echo $(printf 'foo\0bar') | hexdump -C
+  bash: warning: command substitution: ignored null byte in input
+  00000000  66 6f 6f 62 61 72 0a                              |foobar.|
+  ```
+
+- When a nul byte appears directly in a shell script, it's removed with no warning:
+  ```bash
+  $ printf 'echo "foo\0bar"' | bash | hexdump -C
+  00000000  66 6f 6f 62 61 72 0a                              |foobar.|
+  ```
+
+Zsh, in contrast, actually allows nul bytes internally, in shell variables and even arguments to
+builtin commands.  But if a variable is exported to the environment, or if an argument is used for
+an external command, then the child process will see it silently truncated at the first nul.  This
+might actually be more dangerous, depending on the use case.
+
+## Overlong commands
+
+If you pass a long string into a shell, several things might happen:
+
+- It might succeed, yet the shell might have trouble actually doing anything with it.  For example:
+
+  ```bash
+  $ x=$(printf '%010000000d' 0); /bin/echo $x
+  bash: /bin/echo: Argument list too long
+  ```
+
+- If you're using certain shells (e.g. Busybox Ash) *and* using a pty for communication, then the
+  shell will impose a line length limit, ignoring all input past the limit.
+
+- If you're using a pty in cooked mode, then by default, if you write so many bytes as input that
+  it fills the kernel's internal buffer, the kernel will simply drop those bytes, instead of
+  blocking waiting for the shell to empty out the buffer.  In other words, random bits of input can
+  be lost, which is obviously insecure.
+
+Future versions of this crate may add an option to [`Quoter`] to check the length for you.
+
+## Control characters (*interactive contexts only*)
+
+Control characters are the bytes from `\x00` to `\x1f`, plus `\x7f`.  `\x00` (the nul byte) is
+discussed [above](#nul-bytes), but what about the rest?  Well, many of them correspond to terminal
+keyboard shortcuts.  For example, when you press Ctrl-A at a shell prompt, your terminal sends the
+byte `\x01`.  The shell sees that byte and (if not configured differently) takes the standard
+action for Ctrl-A, which is to move the cursor to the beginning of the line.
+
+This means that it's quite dangerous to pipe bytes to an interactive shell.  For example, here is a
+program that tries to tell Bash to echo an arbitrary string, 'safely':
+```rust
+use std::process::{Command, Stdio};
+use std::io::Write;
+
+let evil_string = "\x01do_something_evil; ";
+let quoted = shlex::try_quote(evil_string).unwrap();
+println!("quoted string is {:?}", quoted);
+
+let mut bash = Command::new("bash")
+    .arg("-i") // force interactive mode
+    .stdin(Stdio::piped())
+    .spawn()
+    .unwrap();
+let stdin = bash.stdin.as_mut().unwrap();
+write!(stdin, "echo {}\n", quoted).unwrap();
+```
+
+Here's the output of the program (with irrelevant bits removed):
+
+```text
+quoted string is "'\u{1}do_something_evil; '"
+/tmp comex$ do_something_evil; 'echo '
+bash: do_something_evil: command not found
+bash: echo : command not found
+```
+
+Even though we quoted it, Bash still ran an arbitrary command!
+
+This is not because the quoting was insufficient, per se.  In single quotes, all input is supposed
+to be treated as raw data until the closing single quote.  And in fact, this would work fine
+without the `"-i"` argument.
+
+But line input is a separate stage from shell syntax parsing.  After all, if you type a single
+quote on the keyboard, you wouldn't expect it to disable all your keyboard shortcuts.  So a control
+character always has its designated effect, no matter if it's quoted or backslash-escaped.
+
+Also, some control characters are interpreted by the kernel tty layer instead, like Ctrl-C to send
+SIGINT.  These can be an issue even with noninteractive shells, but only if using a pty for
+communication, as opposed to a pipe.
+
+To be safe, you just have to avoid sending them.
+
+### Why not just use hex escapes?
+
+In any normal programming language, this would be no big deal.
+
+Any normal language has a way to escape arbitrary characters in strings by writing out their
+numeric values.  For example, Rust lets you write them in hexadecimal, like `"\x4f"` (or
+`"\u{1d546}"` for Unicode).  In this way, arbitrary strings can be represented using only 'nice'
+simple characters.  Any remotely suspicious character can be replaced with a numeric escape
+sequence, where the escape sequence itself consists only of alphanumeric characters and some
+punctuation.  The result may not be the most readable[^choices], but it's quite safe from being
+misinterpreted or corrupted in transit.
+
+Shell is not normal.  It has no numeric escape sequences.
+
+There are a few different ways to quote characters (unquoted, unquoted-with-backslash, single
+quotes, double quotes), but all of them involve writing the character itself.  If the input
+contains a control character, the output must contain that same character.
+
+### Mitigation: terminal filters
+
+In practice, automating interactive shells like in the above example is pretty uncommon these days.
+In most cases, the only way for a programmatically generated string to make its way to the input of
+an interactive shell is if a human copies and pastes it into their terminal.
+
+And many terminals detect when you paste a string containing control characters.  iTerm2 strips
+them out; gnome-terminal replaces them with alternate characters[^gr]; Kitty outright prompts for
+confirmation.  This mitigates the risk.
+
+But it's not perfect.  Some other terminals don't implement this check or implement it incorrectly.
+Also, these checks tend to not filter the tab character, which could trigger tab completion.  In
+most cases that's a non-issue, because most shells support paste bracketing, which disables tab and
+some other control characters[^bracketing] within pasted text.  But in some cases paste bracketing
+gets disabled.
+
+### Future possibility: ANSI-C quoting
+
+I said that shell syntax has no numeric escapes, but that only applies to *portable* shell syntax.
+Bash and Zsh support an obscure alternate quoting style with the syntax `$'foo'`.  It's called
+["ANSI-C quoting"][ansic], and inside it you can use all the escape sequences supported by C,
+including hex escapes:
+
+```bash
+$ echo $'\x41\n\x42'
+A
+B
+```
+
+But other shells don't support it — including Dash, a popular choice for `/bin/sh`, and Busybox's
+Ash, frequently seen on stripped-down embedded systems.  This crate's quoting functionality [tries
+to be compatible](crate#compatibility) with those shells, plus all other POSIX-compatible shells.
+That makes ANSI-C quoting a no-go.
+
+Still, future versions of this crate may provide an option to enable ANSI-C quoting, at the cost of
+reduced portability.
+
+### Future possibility: printf
+
+Another option would be to invoke the `printf` command, which is required by POSIX to support octal
+escapes.  For example, you could 'escape' the Rust string `"\x01"` into the shell syntax `"$(printf
+'\001')"`.  The shell will execute the command `printf` with the first argument being literally a
+backslash followed by three digits; `printf` will output the actual byte with value 1; and the
+shell will substitute that back into the original command.
+
+The problem is that 'escaping' a string into a command substitution just feels too surprising.  If
+nothing else, it only works with an actual shell; [other languages' shell parsing
+routines](crate#compatibility) wouldn't understand it.  Neither would this crate's own parser,
+though that could be fixed.
+
+Future versions of this crate may provide an option to use `printf` for quoting.
+
+### Special note: newlines
+
+Did you know that `\r` and `\n` are control characters?  They aren't as dangerous as other control
+characters (if quoted properly).  But there's still an issue with them in interactive contexts.
+
+Namely, in some cases, interactive shells and/or the tty layer will 'helpfully' translate between
+different line ending conventions.  The possibilities include replacing `\r` with `\n`, replacing
+`\n` with `\r\n`, and others.  This can't result in command injection, but it's still a lossy
+transformation which can result in a failure to round-trip (i.e. the shell sees a different string
+from what was originally passed to `quote`).
+
+Numeric escapes would solve this as well.
+
+# Solved issues
+
+## Solved: Past vulnerability (GHSA-r7qv-8r2h-pg27 / RUSTSEC-2024-0006)
+
+Versions of this crate before 1.3.0 did not quote `{`, `}`, and `\xa0`.
+
+See:
+- <https://github.com/advisories/GHSA-r7qv-8r2h-pg27>
+- <https://rustsec.org/advisories/RUSTSEC-2024-0006.html>
+
+## Solved: `!` and `^`
+
+There are two non-control characters which have a special meaning in interactive contexts only: `!` and
+`^`.  Luckily, these can be escaped adequately.
+
+The `!` character triggers [history expansion][he]; the `^` character can trigger a variant of
+history expansion known as [Quick Substitution][qs].  Both of these characters get expanded even
+inside of double-quoted strings\!
+
+If we're in a double-quoted string, then we can't just escape these characters with a backslash.
+Only a specific set of characters can be backslash-escaped inside double quotes; the set of
+supported characters depends on the shell, but it often doesn't include `!` and `^`.[^escbs]
+Trying to backslash-escape an unsupported character produces a literal backslash:
+```bash
+$ echo "\!"
+\!
+```
+
+However, these characters don't get expanded in single-quoted strings, so this crate just
+single-quotes them.
+
+But there's a Bash bug where `^` actually does get partially expanded in single-quoted strings:
+```bash
+$ echo '
+> ^a^b
+> '
+
+!!:s^a^b
+```
+
+To work around that, this crate forces `^` to appear right after an opening single quote.  For
+example, the string `"^` is quoted into `'"''^'` instead of `'"^'`.  This restriction is overkill,
+since `^` is only meaningful right after a newline, but it's a sufficient restriction (after all, a
+`^` character can't be preceded by a newline if it's forced to be preceded by a single quote), and
+for now it simplifies things.
+
+## Solved: `\xa0`
+
+The byte `\xa0` may be treated as a shell word separator, specifically by Bash on macOS when using
+the default UTF-8 locale, and only when the input is invalid UTF-8.  This crate handles the issue
+by always using quotes for arguments containing this byte.
+
+In fact, this crate always uses quotes for arguments containing any non-ASCII bytes.  This may be
+changed in the future, since it's a bit unfriendly to non-English users.  But for now it
+minimizes risk, especially considering the large number of different legacy single-byte locales
+someone might hypothetically be running their shell in.
+
+### Demonstration
+
+```bash
+$ echo -e 'ls a\xa0b' | bash
+ls: a: No such file or directory
+ls: b: No such file or directory
+```
+The normal behavior would be to output a single line, e.g.:
+```bash
+$ echo -e 'ls a\xa0b' | bash
+ls: cannot access 'a'$'\240''b': No such file or directory
+```
+(The specific quoting in the error doesn't matter.)
+
+### Cause
+
+Just for fun, here's why this behavior occurs:
+
+Bash decides which bytes serve as word separators based on the libc function [`isblank`][isblank].
+On macOS on UTF-8 locales, this passes for `\xa0`, corresponding to U+00A0 NO-BREAK SPACE.
+
+This is doubly unique compared to the other systems I tested (Linux/glibc, Linux/musl, and
+Windows/MSVC).  First, the other systems don't allow bytes in the range [0x80, 0xFF] to pass
+<code>is<i>foo</i></code> functions in UTF-8 locales, even if the corresponding Unicode codepoint
+does pass, as determined by the wide-character equivalent function, <code>isw<i>foo</i></code>.
+Second, the other systems don't treat U+00A0 as blank (even using `iswblank`).
+
+Meanwhile, Bash checks for multi-byte sequences and forbids them from being treated as special
+characters, so the proper UTF-8 encoding of U+00A0, `b"\xc2\xa0"`, is not treated as a word
+separator.  Treatment as a word separator only happens for `b"\xa0"` alone, which is illegal UTF-8.
+
+[ansic]: https://www.gnu.org/software/bash/manual/html_node/ANSI_002dC-Quoting.html
+[he]: https://www.gnu.org/software/bash/manual/html_node/History-Interaction.html
+[qs]: https://www.gnu.org/software/bash/manual/html_node/Event-Designators.html
+[isblank]: https://man7.org/linux/man-pages/man3/isblank.3p.html
+[nul]: #nul-bytes
+
+[^choices]: This can lead to tough choices over which
+  characters to escape and which to leave as-is, especially when Unicode gets involved and you
+  have to balance the risk of confusion with the benefit of properly supporting non-English
+  languages.
+  <br>
+  <br>
+  We don't have the luxury of those choices.
+
+[^gr]: For example, backspace (in Unicode lingo, U+0008 BACKSPACE) turns into U+2408 SYMBOL FOR BACKSPACE.
+
+[^bracketing]: It typically disables almost all handling of control characters by the shell proper,
+    but one necessary exception is the end-of-paste sequence itself (which starts with the control
+    character `\x1b`).  In addition, paste bracketing does not suppress handling of control
+    characters by the kernel tty layer, such as `\x03` sending SIGINT (which typically clears the
+    currently typed command, making it dangerous in a similar way to `\x01`).
+
+[^escbs]: For example, Dash doesn't remove the backslash from `"\!"` because it simply doesn't know
+    anything about `!` as a special character: it doesn't support history expansion.  On the other
+    end of the spectrum, Zsh supports history expansion and does remove the backslash — though only
+    in interactive mode.  Bash's behavior is weirder.  It supports history expansion, and if you
+    write `"\!"`, the backslash does prevent history expansion from occurring — but it doesn't get
+    removed!
+
+*/
+
+// `use` declarations to make auto links work:
+use ::{quote, join, Shlex, Quoter, QuoteError};
+
+// TODO: add more about copy-paste and human readability.
author	Jeff Vander Stoep <jeffv@google.com>	2024-02-06 12:04:30 +0000
committer	Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>	2024-02-06 12:04:30 +0000
commit	52363e2196b542f4f5af7c50f54d709b0ae9a76f (patch)
tree	4b29caddb5fb6ef3d61025c0e200e7447449c8d0
parent	35e1d8b055ec3348dae989b958938175ffa067b4 (diff)
parent	bce9dd1ed2cdcf01509b6fbf5302ffaa7714e935 (diff)
download	shlex-emu-34-2-dev.tar.gz