// Copyright 2020 The Pigweed Authors // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy of // the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations under // the License. #include "pw_tokenizer/detokenize.h" #include #include #include "pw_bytes/bit.h" #include "pw_bytes/endian.h" #include "pw_tokenizer/base64.h" #include "pw_tokenizer/internal/decode.h" #include "pw_tokenizer/nested_tokenization.h" namespace pw::tokenizer { namespace { class NestedMessageDetokenizer { public: NestedMessageDetokenizer(const Detokenizer& detokenizer) : detokenizer_(detokenizer) {} void Detokenize(std::string_view chunk) { for (char next_char : chunk) { Detokenize(next_char); } } void Detokenize(char next_char) { switch (state_) { case kNonMessage: if (next_char == PW_TOKENIZER_NESTED_PREFIX) { message_buffer_.push_back(next_char); state_ = kMessage; } else { output_.push_back(next_char); } break; case kMessage: if (base64::IsValidChar(next_char)) { message_buffer_.push_back(next_char); } else { HandleEndOfMessage(); if (next_char == PW_TOKENIZER_NESTED_PREFIX) { message_buffer_.push_back(next_char); } else { output_.push_back(next_char); state_ = kNonMessage; } } break; } } std::string Flush() { if (state_ == kMessage) { HandleEndOfMessage(); state_ = kNonMessage; } return std::move(output_); } private: void HandleEndOfMessage() { if (auto result = detokenizer_.DetokenizeBase64Message(message_buffer_); result.ok()) { output_ += result.BestString(); } else { output_ += message_buffer_; // Keep the original if it doesn't decode. } message_buffer_.clear(); } const Detokenizer& detokenizer_; std::string output_; std::string message_buffer_; enum { kNonMessage, kMessage } state_ = kNonMessage; }; std::string UnknownTokenMessage(uint32_t value) { std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token "); // Output a hexadecimal version of the token. for (int shift = 28; shift >= 0; shift -= 4) { output.push_back("0123456789abcdef"[(value >> shift) & 0xF]); } output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX); return output; } // Decoding result with the date removed, for sorting. using DecodingResult = std::pair; // Determines if one result is better than the other if collisions occurred. // Returns true if lhs is preferred over rhs. This logic should match the // collision resolution logic in detokenize.py. bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) { // Favor the result for which decoding succeeded. if (lhs.first.ok() != rhs.first.ok()) { return lhs.first.ok(); } // Favor the result for which all bytes were decoded. if ((lhs.first.remaining_bytes() == 0u) != (rhs.first.remaining_bytes() == 0u)) { return lhs.first.remaining_bytes() == 0u; } // Favor the result with fewer decoding errors. if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) { return lhs.first.decoding_errors() < rhs.first.decoding_errors(); } // Favor the result that successfully decoded the most arguments. if (lhs.first.argument_count() != rhs.first.argument_count()) { return lhs.first.argument_count() > rhs.first.argument_count(); } // Favor the result that was removed from the database most recently. return lhs.second > rhs.second; } } // namespace DetokenizedString::DetokenizedString( uint32_t token, const span& entries, const span& arguments) : token_(token), has_token_(true) { std::vector results; for (const auto& [format, date_removed] : entries) { results.push_back(DecodingResult{format.Format(arguments), date_removed}); } std::sort(results.begin(), results.end(), IsBetterResult); for (auto& result : results) { matches_.push_back(std::move(result.first)); } } std::string DetokenizedString::BestString() const { return matches_.empty() ? std::string() : matches_[0].value(); } std::string DetokenizedString::BestStringWithErrors() const { if (matches_.empty()) { return has_token_ ? UnknownTokenMessage(token_) : PW_TOKENIZER_ARG_DECODING_ERROR("missing token"); } return matches_[0].value_with_errors(); } Detokenizer::Detokenizer(const TokenDatabase& database) { for (const auto& entry : database) { database_[entry.token].emplace_back(entry.string, entry.date_removed); } } DetokenizedString Detokenizer::Detokenize( const span& encoded) const { // The token is missing from the encoded data; there is nothing to do. if (encoded.empty()) { return DetokenizedString(); } uint32_t token = bytes::ReadInOrder( endian::little, encoded.data(), encoded.size()); const auto result = database_.find(token); return DetokenizedString( token, result == database_.end() ? span() : span(result->second), encoded.size() < sizeof(token) ? span() : encoded.subspan(sizeof(token))); } DetokenizedString Detokenizer::DetokenizeBase64Message( std::string_view text) const { std::string buffer(text); buffer.resize(PrefixedBase64DecodeInPlace(buffer)); return Detokenize(buffer); } std::string Detokenizer::DetokenizeBase64(std::string_view text) const { NestedMessageDetokenizer nested_detokenizer(*this); nested_detokenizer.Detokenize(text); return nested_detokenizer.Flush(); } } // namespace pw::tokenizer