// Copyright 2011 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "src/parsing/scanner-character-streams.h" #include "include/v8.h" #include "src/counters.h" #include "src/globals.h" #include "src/handles.h" #include "src/objects-inl.h" #include "src/parsing/scanner.h" #include "src/unicode-inl.h" namespace v8 { namespace internal { namespace { const unibrow::uchar kUtf8Bom = 0xFEFF; } // namespace template struct HeapStringType; template <> struct HeapStringType { typedef SeqOneByteString String; }; template <> struct HeapStringType { typedef SeqTwoByteString String; }; template struct Range { const Char* start; const Char* end; size_t length() { return static_cast(end - start); } bool unaligned_start() const { return reinterpret_cast(start) % sizeof(Char) == 1; } }; // A Char stream backed by an on-heap SeqOneByteString or SeqTwoByteString. template class OnHeapStream { public: typedef typename HeapStringType::String String; OnHeapStream(Handle string, size_t start_offset, size_t end) : string_(string), start_offset_(start_offset), length_(end) {} Range GetDataAt(size_t pos) { return {&string_->GetChars()[start_offset_ + Min(length_, pos)], &string_->GetChars()[start_offset_ + length_]}; } static const bool kCanAccessHeap = true; private: Handle string_; const size_t start_offset_; const size_t length_; }; // A Char stream backed by an off-heap ExternalOneByteString or // ExternalTwoByteString. template class ExternalStringStream { public: ExternalStringStream(const Char* data, size_t end) : data_(data), length_(end) {} Range GetDataAt(size_t pos) { return {&data_[Min(length_, pos)], &data_[length_]}; } static const bool kCanAccessHeap = false; private: const Char* const data_; const size_t length_; }; // A Char stream backed by multiple source-stream provided off-heap chunks. template class ChunkedStream { public: ChunkedStream(ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats) : source_(source), stats_(stats) {} Range GetDataAt(size_t pos) { Chunk chunk = FindChunk(pos); size_t buffer_end = chunk.length; size_t buffer_pos = Min(buffer_end, pos - chunk.position); return {&chunk.data[buffer_pos], &chunk.data[buffer_end]}; } ~ChunkedStream() { for (Chunk& chunk : chunks_) delete[] chunk.data; } static const bool kCanAccessHeap = false; private: struct Chunk { Chunk(const Char* const data, size_t position, size_t length) : data(data), position(position), length(length) {} const Char* const data; // The logical position of data. const size_t position; const size_t length; size_t end_position() const { return position + length; } }; Chunk FindChunk(size_t position) { while (V8_UNLIKELY(chunks_.empty())) FetchChunk(size_t{0}); // Walk forwards while the position is in front of the current chunk. while (position >= chunks_.back().end_position() && chunks_.back().length > 0) { FetchChunk(chunks_.back().end_position()); } // Walk backwards. for (auto reverse_it = chunks_.rbegin(); reverse_it != chunks_.rend(); ++reverse_it) { if (reverse_it->position <= position) return *reverse_it; } UNREACHABLE(); } virtual void ProcessChunk(const uint8_t* data, size_t position, size_t length) { // Incoming data has to be aligned to Char size. DCHECK_EQ(0, length % sizeof(Char)); chunks_.emplace_back(reinterpret_cast(data), position, length / sizeof(Char)); } void FetchChunk(size_t position) { const uint8_t* data = nullptr; size_t length; { RuntimeCallTimerScope scope(stats_, RuntimeCallCounterId::kGetMoreDataCallback); length = source_->GetMoreData(&data); } ProcessChunk(data, position, length); } ScriptCompiler::ExternalSourceStream* source_; RuntimeCallStats* stats_; protected: std::vector chunks_; }; template class Utf8ChunkedStream : public ChunkedStream { public: Utf8ChunkedStream(ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats) : ChunkedStream(source, stats) {} STATIC_ASSERT(sizeof(Char) == sizeof(uint16_t)); void ProcessChunk(const uint8_t* data, size_t position, size_t length) final { if (length == 0) { unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state_); if (t != unibrow::Utf8::kBufferEmpty) { DCHECK_EQ(t, unibrow::Utf8::kBadChar); incomplete_char_ = 0; uint16_t* result = new uint16_t[1]; result[0] = unibrow::Utf8::kBadChar; chunks_.emplace_back(result, position, 1); position++; } chunks_.emplace_back(nullptr, position, 0); delete[] data; return; } // First count the number of complete characters that can be produced. unibrow::Utf8::State state = state_; uint32_t incomplete_char = incomplete_char_; bool seen_bom = seen_bom_; size_t i = 0; size_t chars = 0; while (i < length) { unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(data[i], &i, &state, &incomplete_char); if (!seen_bom && t == kUtf8Bom && position + chars == 0) { seen_bom = true; // BOM detected at beginning of the stream. Don't copy it. } else if (t != unibrow::Utf8::kIncomplete) { chars++; if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++; } } // Process the data. // If there aren't any complete characters, update the state without // producing a chunk. if (chars == 0) { state_ = state; incomplete_char_ = incomplete_char; seen_bom_ = seen_bom; delete[] data; return; } // Update the state and produce a chunk with complete characters. uint16_t* result = new uint16_t[chars]; uint16_t* cursor = result; i = 0; while (i < length) { unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(data[i], &i, &state_, &incomplete_char_); if (V8_LIKELY(t < kUtf8Bom)) { *(cursor++) = static_cast(t); // The by most frequent case. } else if (t == unibrow::Utf8::kIncomplete) { continue; } else if (!seen_bom_ && t == kUtf8Bom && position == 0 && cursor == result) { // BOM detected at beginning of the stream. Don't copy it. seen_bom_ = true; } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) { *(cursor++) = static_cast(t); } else { *(cursor++) = unibrow::Utf16::LeadSurrogate(t); *(cursor++) = unibrow::Utf16::TrailSurrogate(t); } } chunks_.emplace_back(result, position, chars); delete[] data; } private: uint32_t incomplete_char_ = 0; unibrow::Utf8::State state_ = unibrow::Utf8::State::kAccept; bool seen_bom_ = false; }; // Provides a buffered utf-16 view on the bytes from the underlying ByteStream. // Chars are buffered if either the underlying stream isn't utf-16 or the // underlying utf-16 stream might move (is on-heap). template