diff options
Diffstat (limited to 'src/binary_parse')
-rwxr-xr-x | src/binary_parse/cached_paged_byte_array.cc | 79 | ||||
-rwxr-xr-x | src/binary_parse/cached_paged_byte_array.h | 73 | ||||
-rwxr-xr-x | src/binary_parse/range_checked_byte_ptr.cc | 402 | ||||
-rwxr-xr-x | src/binary_parse/range_checked_byte_ptr.h | 609 |
4 files changed, 1163 insertions, 0 deletions
diff --git a/src/binary_parse/cached_paged_byte_array.cc b/src/binary_parse/cached_paged_byte_array.cc new file mode 100755 index 0000000..83b6c03 --- /dev/null +++ b/src/binary_parse/cached_paged_byte_array.cc @@ -0,0 +1,79 @@ +// Copyright 2015 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// +// +// The cache layer works as follows: +// The cache is implemented as a vector (of size 'cache_size') of shared +// pointers to pages recently used. The least recently used page is stored +// at the begining of the vector, the most recent at the end. + +#include "src/binary_parse/cached_paged_byte_array.h" + +#include <cstddef> + +namespace piex { +namespace binary_parse { + +CachedPagedByteArray::CachedPagedByteArray( + const PagedByteArray* paged_byte_array, size_t cache_size) + : paged_byte_array_(paged_byte_array), cache_size_(cache_size) {} + +void CachedPagedByteArray::getPage(size_t page_index, + const unsigned char** begin, + const unsigned char** end, + PagedByteArray::PagePtr* page) const { + std::lock_guard<std::mutex> lock(mutex_); + size_t cache_index; + if (getFromCache(page_index, &cache_index)) { + // Cache hit, retrieve the page from the cache. + *begin = cached_pages_[cache_index].begin; + *end = cached_pages_[cache_index].end; + *page = cached_pages_[cache_index].page; + + // Remove the page to insert it at the end of the cache later. + cached_pages_.erase(cached_pages_.begin() + + static_cast<std::ptrdiff_t>(cache_index)); + } else { + // Cache miss, ask PagedByteArray to load the page. + paged_byte_array_->getPage(page_index, begin, end, page); + + // If the cache is full, remove the first (least recently used) page. + if (cached_pages_.size() >= cache_size_) { + cached_pages_.erase(cached_pages_.begin()); + } + } + + // Cache the most recently used page to the end of the vector. + CachedPage cache_page; + cache_page.index = page_index; + cache_page.page = *page; + cache_page.begin = *begin; + cache_page.end = *end; + cached_pages_.push_back(cache_page); +} + +bool CachedPagedByteArray::getFromCache(size_t page_index, + size_t* cache_index) const { + for (size_t i = 0; i < cached_pages_.size(); ++i) { + if (cached_pages_[i].index == page_index) { + *cache_index = i; + return true; + } + } + return false; +} + +} // namespace binary_parse +} // namespace piex diff --git a/src/binary_parse/cached_paged_byte_array.h b/src/binary_parse/cached_paged_byte_array.h new file mode 100755 index 0000000..26f0eae --- /dev/null +++ b/src/binary_parse/cached_paged_byte_array.h @@ -0,0 +1,73 @@ +// Copyright 2015 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// +// +// LRU cache decorator for binary_parse::PagedByteArray subclasses. + +#ifndef PIEX_BINARY_PARSE_CACHED_PAGED_BYTE_ARRAY_H_ +#define PIEX_BINARY_PARSE_CACHED_PAGED_BYTE_ARRAY_H_ + +#include <mutex> +#include <vector> + +#if !defined(WIN32_LEAN_AND_MEAN) +#define WIN32_LEAN_AND_MEAN +#endif +#include "src/binary_parse/range_checked_byte_ptr.h" + +namespace piex { +namespace binary_parse { + +class CachedPagedByteArray : public PagedByteArray { + public: + // Decorates 'paged_byte_array' with a LRU cache layer of the size + // 'cache_size'. + explicit CachedPagedByteArray(const PagedByteArray* paged_byte_array, + size_t cache_size); + + virtual size_t length() const { return paged_byte_array_->length(); } + + virtual size_t pageSize() const { return paged_byte_array_->pageSize(); } + + virtual void getPage(size_t page_index, const unsigned char** begin, + const unsigned char** end, + PagedByteArray::PagePtr* page) const; + + private: + struct CachedPage { + size_t index; + PagedByteArray::PagePtr page; + const unsigned char* begin; + const unsigned char* end; + }; + + // Disallow copy construction and assignment. + CachedPagedByteArray(const CachedPagedByteArray&); + void operator=(const CachedPagedByteArray&); + + // Gets the index of the page if it is in the cache and returns true, else + // returns false. + bool getFromCache(size_t page_index, size_t* cache_index) const; + + mutable std::mutex mutex_; + const PagedByteArray* paged_byte_array_; + const size_t cache_size_; + mutable std::vector<CachedPage> cached_pages_; +}; + +} // namespace binary_parse +} // namespace piex + +#endif // PIEX_BINARY_PARSE_CACHED_PAGED_BYTE_ARRAY_H_ diff --git a/src/binary_parse/range_checked_byte_ptr.cc b/src/binary_parse/range_checked_byte_ptr.cc new file mode 100755 index 0000000..bbfdee2 --- /dev/null +++ b/src/binary_parse/range_checked_byte_ptr.cc @@ -0,0 +1,402 @@ +// Copyright 2015 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "src/binary_parse/range_checked_byte_ptr.h" + +#include <assert.h> +#include <cstddef> +#include <cstring> + +namespace piex { +namespace binary_parse { + +#ifdef BREAK_IF_DEBUGGING_AND_OUT_OF_RANGE +#define BREAK_IF_DEBUGGING() assert(false) +#else +#define BREAK_IF_DEBUGGING() assert(true) +#endif + +namespace { +class MemoryPagedByteArray : public PagedByteArray { + public: + MemoryPagedByteArray(const unsigned char *buffer, const size_t len); + + virtual size_t length() const; + virtual size_t pageSize() const; + virtual void getPage(size_t page_index, const unsigned char **begin, + const unsigned char **end, PagePtr *page) const; + + private: + const unsigned char *buffer_; + const size_t len_; +}; + +MemoryPagedByteArray::MemoryPagedByteArray(const unsigned char *buffer, + const size_t len) + : buffer_(buffer), len_(len) {} + +size_t MemoryPagedByteArray::length() const { return len_; } + +size_t MemoryPagedByteArray::pageSize() const { return len_; } + +void MemoryPagedByteArray::getPage(size_t /* page_index */, + const unsigned char **begin, + const unsigned char **end, + PagePtr *page) const { + *begin = buffer_; + *end = buffer_ + len_; + *page = PagePtr(); +} + +// A functor that does nothing. This is used as a no-op shared pointer +// deallocator below. +class NullFunctor { + public: + void operator()() {} + void operator()(PagedByteArray * /* p */) const {} +}; +} // namespace + +PagedByteArray::~PagedByteArray() {} + +RangeCheckedBytePtr::RangeCheckedBytePtr() + : array_(), + page_data_(NULL), + current_pos_(0), + sub_array_begin_(0), + sub_array_end_(0), + page_begin_offset_(0), + current_page_len_(0), + error_flag_(RANGE_CHECKED_BYTE_ERROR) {} + +RangeCheckedBytePtr::RangeCheckedBytePtr(const unsigned char *array, + const size_t len) + : array_(new MemoryPagedByteArray(array, len)), + page_data_(NULL), + current_pos_(0), + sub_array_begin_(0), + sub_array_end_(len), + page_begin_offset_(0), + current_page_len_(0), + error_flag_(RANGE_CHECKED_BYTE_SUCCESS) { + assert(array); + if (array == NULL) { + error_flag_ = RANGE_CHECKED_BYTE_ERROR; + } +} + +RangeCheckedBytePtr::RangeCheckedBytePtr(PagedByteArray *array) + : array_(array, NullFunctor()), + page_data_(NULL), + current_pos_(0), + sub_array_begin_(0), + sub_array_end_(array->length()), + page_begin_offset_(0), + current_page_len_(0), + error_flag_(RANGE_CHECKED_BYTE_SUCCESS) {} + +RangeCheckedBytePtr RangeCheckedBytePtr::invalidPointer() { + return RangeCheckedBytePtr(); +} + +RangeCheckedBytePtr RangeCheckedBytePtr::pointerToSubArray( + size_t pos, size_t length) const { + RangeCheckedBytePtr sub_result = (*this) + pos; + if (!sub_result.errorOccurred() && length <= sub_result.remainingLength()) { + sub_result.sub_array_begin_ = sub_result.current_pos_; + sub_result.sub_array_end_ = sub_result.sub_array_begin_ + length; + + // Restrict the boundaries of the current page to the newly set sub-array. + sub_result.restrictPageToSubArray(); + + return sub_result; + } else { + error_flag_ = RANGE_CHECKED_BYTE_ERROR; + return invalidPointer(); + } +} + +size_t RangeCheckedBytePtr::offsetInArray() const { + // sub_array_begin_ <= current_pos_ is a class invariant, but protect + // against violations of this invariant. + if (sub_array_begin_ <= current_pos_) { + return current_pos_ - sub_array_begin_; + } else { + assert(false); + return 0; + } +} + +std::string RangeCheckedBytePtr::substr(size_t pos, size_t length) const { + std::vector<unsigned char> bytes = extractBytes(pos, length); + std::string result; + result.reserve(bytes.size()); + for (size_t i = 0; i < bytes.size(); ++i) { + result.push_back(static_cast<char>(bytes[i])); + } + return result; +} + +std::vector<unsigned char> RangeCheckedBytePtr::extractBytes( + size_t pos, size_t length) const { + std::vector<unsigned char> result; + if (pos + length < pos /* overflow */ || remainingLength() < pos + length) { + BREAK_IF_DEBUGGING(); + error_flag_ = RANGE_CHECKED_BYTE_ERROR_OVERFLOW; + return result; + } + result.reserve(length); + for (size_t i = 0; i < length; ++i) { + result.push_back((*this)[pos + i]); + } + return result; +} + +bool operator==(const RangeCheckedBytePtr &x, const RangeCheckedBytePtr &y) { + if (x.array_ != y.array_) { + assert(false); + return false; + } + + return x.current_pos_ == y.current_pos_; +} + +bool operator!=(const RangeCheckedBytePtr &x, const RangeCheckedBytePtr &y) { + return !(x == y); +} + +void RangeCheckedBytePtr::loadPageForOffset(size_t offset) const { + // The offset should always lie within the bounds of the sub-array (this + // condition is enforced at the callsite). However, even if the offset lies + // outside the sub-array, the restrictPageToSubArray() call at the end + // ensures that the object is left in a consistent state that maintains the + // class invariants. + assert(offset >= sub_array_begin_ && offset < sub_array_end_); + + // Ensure that offset lies within the array. + if (offset >= array_->length()) { + assert(false); + return; + } + + // Determine the index of the page to request. + size_t page_index = offset / array_->pageSize(); + + // Get the page. + const unsigned char *page_begin; + const unsigned char *page_end; + array_->getPage(page_index, &page_begin, &page_end, &page_); + + // Ensure that the page has the expected length (as specified in the + // PagedByteArray interface). + size_t expected_page_size = array_->pageSize(); + if (page_index == (array_->length() - 1) / array_->pageSize()) { + expected_page_size = array_->length() - array_->pageSize() * page_index; + } + if ((page_end < page_begin) || + (static_cast<size_t>(page_end - page_begin) != expected_page_size)) { + assert(false); + return; + } + + // Remember information about page. + page_data_ = page_begin; + page_begin_offset_ = page_index * array_->pageSize(); + current_page_len_ = static_cast<size_t>(page_end - page_begin); + + // Restrict the boundaries of the page to lie within the sub-array. + restrictPageToSubArray(); +} + +void RangeCheckedBytePtr::restrictPageToSubArray() const { + // Restrict the current page's boundaries so that it is always contained + // completely within the extent of the sub-array. + // This function is purposely designed to work correctly in the following + // two special cases: + // a) The current page lies entirely outside the sub-array. In this case, + // current_page_len_ will be set to zero. page_data_ may either remain + // unchanged or may be changed to point one element beyond the end of the + // page, depending on whether the current page lies before or after the + // sub-array. + // b) The current page is in the state as initialized by the constructor + // (i.e. page_data_ is NULL and current_page_len_ is zero). In this case, + // page_data_ and current_page_len_ will remain unchanged. + + // Does the beginning of the page lie before the beginning of the sub-array? + if (page_begin_offset_ < sub_array_begin_) { + // Compute amount by which to shorten page. + size_t amount_to_shorten = sub_array_begin_ - page_begin_offset_; + if (amount_to_shorten > current_page_len_) { + amount_to_shorten = current_page_len_; + } + + // Adjust beginning of page accordingly. + page_begin_offset_ += amount_to_shorten; + page_data_ += amount_to_shorten; + current_page_len_ -= amount_to_shorten; + } + + // Does the end of the page lie beyond the end of the sub-array? + if (page_begin_offset_ + current_page_len_ > sub_array_end_) { + // Reduce length of page accordingly. + size_t new_len = sub_array_end_ - page_begin_offset_; + if (new_len > current_page_len_) { + new_len = current_page_len_; + } + current_page_len_ = new_len; + } +} + +int memcmp(const RangeCheckedBytePtr &x, const RangeCheckedBytePtr &y, + size_t num) { + std::vector<unsigned char> x_vec = x.extractBytes(0, num); + std::vector<unsigned char> y_vec = y.extractBytes(0, num); + + if (!x.errorOccurred() && !y.errorOccurred()) { + return ::memcmp(&x_vec[0], &y_vec[0], num); + } else { + // return an arbitrary value + return -1; + } +} + +int strcmp(const RangeCheckedBytePtr &x, const std::string &y) { + std::vector<unsigned char> x_vec = x.extractBytes(0, y.length()); + + if (!x.errorOccurred()) { + return ::memcmp(&x_vec[0], y.c_str(), y.length()); + } else { + // return an arbitrary value + return -1; + } +} + +size_t strlen(const RangeCheckedBytePtr &src) { + size_t len = 0; + RangeCheckedBytePtr str = src; + while (!str.errorOccurred() && (str[0] != '\0')) { + str++; + len++; + } + return len; +} + +int16 Get16s(const RangeCheckedBytePtr &input, const bool big_endian, + MemoryStatus *status) { + const uint16 unsigned_value = Get16u(input, big_endian, status); + if (*status != RANGE_CHECKED_BYTE_SUCCESS) { + // Return an arbitrary value. + return 0; + } + + // Convert the two's-complement signed integer encoded in 'unsigned_value' + // into a signed representation in the implementation's native representation + // for signed integers. An optimized Blaze build (x64) compiles all of the + // following code to a no-op (as of this writing). + // For further details, see the corresponding comment in Get32s(). + if (unsigned_value == 0x8000u) { + return static_cast<int16>(-0x8000); + } else if (unsigned_value > 0x8000u) { + return -static_cast<int16>(0x10000u - unsigned_value); + } else { + return static_cast<int16>(unsigned_value); + } +} + +uint16 Get16u(const RangeCheckedBytePtr &input, const bool big_endian, + MemoryStatus *status) { + if (input.remainingLength() < 2) { + if (status && *status == RANGE_CHECKED_BYTE_SUCCESS) { + *status = RANGE_CHECKED_BYTE_ERROR; + } + // Return an arbitrary value. + return 0; + } + if (big_endian) { + return (static_cast<uint16>(input[0]) << 8) | static_cast<uint16>(input[1]); + } else { + return (static_cast<uint16>(input[1]) << 8) | static_cast<uint16>(input[0]); + } +} + +int32 Get32s(const RangeCheckedBytePtr &input, const bool big_endian, + MemoryStatus *status) { + const uint32 unsigned_value = Get32u(input, big_endian, status); + if (*status != RANGE_CHECKED_BYTE_SUCCESS) { + // Return an arbitrary value. + return 0; + } + + // Convert the two's-complement signed integer encoded in 'unsigned_value' + // into a signed representation in the implementation's native representation + // for signed integers. + // For all practical purposes, the same result could be obtained simply by + // casting unsigned_value to int32; the result of this is + // implementation-defined, but on all of the platforms we care about, it does + // what we want. + // The code below, however, arguably has the aesthetic advantage of being + // independent of the representation for signed integers chosen by the + // implementation, as long as 'int' and 'unsigned' have the required range to + // represent all of the required values. + // An optimized Blaze build (x64) compiles all of the following code to a + // no-op (as of this writing); i.e. the value that Get32u() returned in %eax + // is left unchanged. + if (unsigned_value == 0x80000000u) { + // Read here on why the constant expression is written this way: + // http://stackoverflow.com/questions/14695118 + return -0x7fffffff - 1; + } else if (unsigned_value > 0x80000000u) { + // The expression + // 0xffffffffu - unsigned_value + 1 + // is a portable way of flipping the sign of a twos-complement signed + // integer whose binary representation is stored in an unsigned integer. + // '0xffffffffu + 1' is used in preference to simply '0' because it makes + // it clearer that the correct result will be obtained even if an int is + // greater than 32 bits. The '0xffffffffu + 1' is "spread out" around + // 'unsigned_value' to prevent the compiler from warning about an + // integral constant overflow. ('0' would produce the correct result in + // this case too but would rely in a more subtle way on the rules for + // unsigned wraparound.) + return -static_cast<int32>(0xffffffffu - unsigned_value + 1); + } else { + return static_cast<int32>(unsigned_value); + } +} + +uint32 Get32u(const RangeCheckedBytePtr &input, const bool big_endian, + MemoryStatus *status) { + if (input.remainingLength() < 4) { + if (status && *status == RANGE_CHECKED_BYTE_SUCCESS) { + *status = RANGE_CHECKED_BYTE_ERROR; + } + // Return an arbitrary value. + return 0; + } + if (big_endian) { + return (static_cast<uint32>(input[0]) << 24) | + (static_cast<uint32>(input[1]) << 16) | + (static_cast<uint32>(input[2]) << 8) | + (static_cast<uint32>(input[3]) << 0); + } else { + return (static_cast<uint32>(input[3]) << 24) | + (static_cast<uint32>(input[2]) << 16) | + (static_cast<uint32>(input[1]) << 8) | + (static_cast<uint32>(input[0]) << 0); + } +} + +} // namespace binary_parse +} // namespace piex diff --git a/src/binary_parse/range_checked_byte_ptr.h b/src/binary_parse/range_checked_byte_ptr.h new file mode 100755 index 0000000..a0eadbb --- /dev/null +++ b/src/binary_parse/range_checked_byte_ptr.h @@ -0,0 +1,609 @@ +// Copyright 2015 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef PIEX_BINARY_PARSE_RANGE_CHECKED_BYTE_PTR_H_ +#define PIEX_BINARY_PARSE_RANGE_CHECKED_BYTE_PTR_H_ + +#include <assert.h> + +#include <cstddef> +#include <memory> +#include <string> +#include <vector> + +namespace piex { +namespace binary_parse { + +// Since NaCl does not comply to C++11 we can not just use stdint.h. +typedef unsigned short uint16; // NOLINT +typedef short int16; // NOLINT +typedef unsigned int uint32; +typedef int int32; + +enum MemoryStatus { + RANGE_CHECKED_BYTE_SUCCESS = 0, + RANGE_CHECKED_BYTE_ERROR = 1, + RANGE_CHECKED_BYTE_ERROR_OVERFLOW = 2, + RANGE_CHECKED_BYTE_ERROR_UNDERFLOW = 3, +}; + +// Interface that RangeCheckedBytePtr uses to access the underlying array of +// bytes. This allows RangeCheckedBytePtr to be used to access data as if it +// were stored contiguously in memory, even if the data is in fact split up +// into non-contiguous chunks and / or does not reside in memory. +// +// The only requirement is that the data can be read in pages of a fixed (but +// configurable) size. Notionally, the byte array (which contains length() +// bytes) is split up into non-overlapping pages of pageSize() bytes each. +// (The last page may be shorter if length() is not a multiple of pageSize().) +// There are therefore (length() - 1) / pageSize() + 1 such pages, with indexes +// 0 through (length() - 1) / pageSize(). Page i contains the bytes from offset +// i * pageSize() in the array up to and including the byte at offset +// (i + 1) * pageSize() - 1 (or, in the case of the last page, length() - 1). +// +// In essence, RangeCheckedBytePtr and PagedByteArray together provide a poor +// man's virtual-memory-and-memory-mapped-file work-alike in situations where +// virtual memory cannot be used or would consume too much virtual address +// space. +// +// Thread safety: In general, subclasses implementing this interface should +// ensure that the member functions are thread-safe. It will then be safe to +// access the same array from multiple threads. (Note that RangeCheckedBytePtr +// itself is not thread-safe in the sense that a single instance of +// RangeCheckedBytePtr cannot be used concurrently from multiple threads; it +// is, however, safe to use different RangeCheckedBytePtr instances in +// different threads to access the same PagedByteArray concurrently, assuming +// that the PagedByteArray implementation is thread-safe.) +class PagedByteArray { + public: + // Base class for pages in the byte array. Implementations of PagedByteArray + // can create a subclass of the Page class to manage the lifetime of buffers + // associated with a page returned by getPage(). For example, a + // PagedByteArray backed by a file might define a Page subclass like this: + // + // class FilePage : public Page { + // std::vector<unsigned char> bytes; + // }; + // + // The corresponding getPage() implementation could then look like this: + // + // void getPage(size_t page_index, const unsigned char** begin, + // const unsigned char** end, std::shared_ptr<Page>* page) + // { + // // Create a new page. + // std::shared_ptr<FilePage> file_page(new FilePage()); + // + // // Read contents of page from file into file_page->bytes. + // [...] + // + // // Set *begin and *end to point to beginning and end of + // // file_page->bytes vector. + // *begin = &file_page->bytes[0]; + // *end = *begin + file_page->bytes.size(); + // + // // Return page to caller + // *page = file_page; + // } + // + // In this way, the storage associated with the page (the FilePage::bytes + // vector) will be kept alive until the RangeCheckedBytePtr releases the + // shared pointer. + class Page {}; + + typedef std::shared_ptr<Page> PagePtr; + + virtual ~PagedByteArray(); + + // Returns the length of the array in bytes. The value returned must remain + // the same on every call for the entire lifetime of the object. + virtual size_t length() const = 0; + + // Returns the length of each page in bytes. (The last page may be shorter + // than pageSize() if length() is not a multiple of pageSize() -- see also + // the class-wide comment above.) The value returned must remain the same on + // every call for the entire lifetime of the object. + virtual size_t pageSize() const = 0; + + // Returns a pointer to a memory buffer containing the data for the page + // with index "page_index". + // + // *begin is set to point to the first byte of the page; *end is set to point + // one byte beyond the last byte in the page. This implies that: + // - (*end - *begin) == pageSize() for every page except the last page + // - (*end - *begin) == length() - pageSize() * ((length() - 1) / pageSize()) + // for the last page + // + // *page will be set to a SharedPtr that the caller will hold on to until + // it no longer needs to access the memory buffer. The memory buffer will + // remain valid until the SharedPtr is released or the PagedByteArray object + // is destroyed. An implementation may choose to return a null SharedPtr; + // this indicates that the memory buffer will remain valid until the + // PagedByteArray object is destroyed, even if the caller does not hold on to + // the SharedPtr. (This is intended as an optimization that some + // implementations may choose to take advantage of, as a null SharedPtr is + // cheaper to copy.) + virtual void getPage(size_t page_index, const unsigned char **begin, + const unsigned char **end, PagePtr *page) const = 0; +}; + +typedef std::shared_ptr<PagedByteArray> PagedByteArrayPtr; + +// Smart pointer that has the same semantics as a "const unsigned char *" (plus +// some convenience functions) but provides range checking and the ability to +// access arrays that are not contiguous in memory or do not reside entirely in +// memory (through the PagedByteArray interface). +// +// In the following, we abbreviate RangeCheckedBytePtr as RCBP. +// +// The intent of this class is to allow easy security hardening of code that +// parses binary data structures using raw byte pointers. To do this, only the +// declarations of the pointers need to be changed; the code that uses the +// pointers can remain unchanged. +// +// If an illegal operation occurs on a pointer, an error flag is set, and all +// read operations from this point on return 0. This means that error checking +// need not be done after every access; it is sufficient to check the error flag +// (using errorOccurred()) once before the RCBP is destroyed. Again, this allows +// the majority of the parsing code to remain unchanged. (Note caveats below +// that apply if a copy of the pointer is created.) +// +// Legal operations are exactly the ones that would be legal on a raw C++ +// pointer. Read accesses are legal if they fall within the underlying array. A +// RCBP may point to any element in the underlying array or one element beyond +// the end of the array. +// +// For brevity, the documentation for individual member functions does not state +// explicitly that the error flag will be set on out-of-range operations. +// +// Note: +// +// - Just as for raw pointers, it is legal for a pointer to point one element +// beyond the end of the array, but it is illegal to use operator*() on such a +// pointer. +// +// - If a copy of an RCBP is created, then performing illegal operations on the +// copy affects the error flag of the copy, but not of the original pointer. +// Note that using operator+ and operator- also creates a copy of the pointer. +// For example: +// +// // Assume we have an RCBP called "p" and a size_t variable called +// // "offset". +// RangeCheckedBytePtr sub_data_structure = p + offset; +// +// If "offset" is large enough to cause an out-of-range access, then +// sub_data_structure.errorOccurred() will be true, but p.errorOccurred() will +// still be false. The error flag for sub_data_structure therefore needs to be +// checked before it is destroyed. +class RangeCheckedBytePtr { + private: + // This class maintains the following class invariants: + // - page_data_ always points to a buffer of at least current_page_len_ + // bytes. + // + // - The current position lies within the sub-array, i.e. + // sub_array_begin_ <= current_pos_ <= sub_array_end_ + // + // - The sub-array is entirely contained within the array, i.e. + // 0 <= sub_array_begin <= sub_array_end <= array_->length() + // + // - If the current page is non-empty, it lies completely within the + // sub-array, i.e. + // if _current_page_len_ >= 0, then + // sub_array_begin_ <= page_begin_offset_ + // and + // page_begin_offset_ + current_page_len_ <= sub_array_end_ + // (See also restrictPageToSubArray().) + // (If _current_page_len_ == 0, then page_begin_offset_ may lie outside + // the sub-array; this condition is harmless. Additional logic would be + // required to make page_begin_offset_ lie within the sub-array in this + // case, and it would serve no purpose other than to make the invariant + // slightly simpler.) + // + // Note that it is _not_ a class invariant that current_pos_ needs to lie + // within the current page. Making this an invariant would have two + // undesirable consequences: + // a) When operator[] is called with an index that lies beyond the end of + // the current page, it would need to temporarily load the page that + // contains this index, but it wouldn't be able to "retain" the page + // (i.e. make it the current page) because that would violate the + // proposed invariant. This would lead to inefficient behavior in the + // case where code accesses a large range of indices beyond the end of + // the page because a page would need to be loaded temporarily on each + // access. + // b) It would require more code: loadPageForOffset() would need to be + // called anywhere that current_pos_ changes (whereas, with the present + // approach, loadPageForOffset() is only called in operator[]). + + // PagedByteArray that is accessed by this pointer. + PagedByteArrayPtr array_; + + // Pointer to the current page. + mutable PagedByteArray::PagePtr page_; + + // Pointer to the current page's data buffer. + mutable const unsigned char *page_data_; + + // All of the following offsets are defined relative to the beginning of + // the array defined by the PagedByteArray array_. + + // Array offset that the pointer points to. + size_t current_pos_; + + // Start offset of the current sub-array. + size_t sub_array_begin_; + + // End offset of the current sub-array. + size_t sub_array_end_; + + // Array offset corresponding to the "page_data_" pointer. + mutable size_t page_begin_offset_; + + // Length of the current page. + mutable size_t current_page_len_; + + // Error flag. This is mutable because methods that don't affect the value + // of the pointer itself (such as operator[]) nevertheless need to be able to + // signal error conditions. + mutable MemoryStatus error_flag_; + + RangeCheckedBytePtr(); + + public: + // Creates a pointer that points to the first element of 'array', which has a + // length of 'len'. The caller must ensure that the array remains valid until + // this pointer and any pointers created from it have been destroyed. + // Note: 'len' may be zero, but 'array' must in this case still be a valid, + // non-null pointer. + explicit RangeCheckedBytePtr(const unsigned char *array, const size_t len); + + // Creates a pointer that points to the first element of the given + // PagedByteArray. The caller must ensure that this PagedByteArray remains + // valid until this pointer and any pointers created from it have been + // destroyed. + explicit RangeCheckedBytePtr(PagedByteArray *array); + + // Creates an invalid RangeCheckedBytePtr. Calling errorOccurred() on the + // result of invalidPointer() always returns true. + // Do not check a RangeCheckedBytePtr for validity by comparing against + // invalidPointer(); use errorOccurred() instead. + static RangeCheckedBytePtr invalidPointer(); + + // Returns a RangeCheckedBytePtr that points to a sub-array of this pointer's + // underlying array. The sub-array starts at position 'pos' relative to this + // pointer and is 'length' bytes long. The sub-array must lie within this + // pointer's array, i.e. pos + length <= remainingLength() must hold. If this + // condition is violated, an invalid pointer is returned. + RangeCheckedBytePtr pointerToSubArray(size_t pos, size_t length) const; + + // Returns the number of bytes remaining in the array from this pointer's + // present position. + inline size_t remainingLength() const; + + // Returns the offset (or index) in the underlying array that this pointer + // points to. If this pointer was created using pointerToSubArray(), the + // offset is relative to the beginning of the sub-array (and not relative to + // the beginning of the original array). + size_t offsetInArray() const; + + // Returns whether an out-of-bounds error has ever occurred on this pointer in + // the past. An error occurs if a caller attempts to read from a position + // outside the bounds of the array or to move the pointer outside the bounds + // of the array. + // + // The error flag is never reset. Once an error has occurred, + // all subsequent attempts to read from the pointer (even within the bounds of + // the array) return 0. + // + // Note that it is permissible for a pointer to point one element past the end + // of the array, but it is not permissible to read from this position. This is + // equivalent to the semantics of raw C++ pointers. + inline bool errorOccurred() const; + + // Returns the substring of length 'length' located at position 'pos' relative + // to this pointer. + std::string substr(size_t pos, size_t length) const; + + // Returns 'length' number of bytes from the array starting at position 'pos' + // relative to this pointer. + std::vector<unsigned char> extractBytes(size_t pos, size_t length) const; + + // Equivalent to calling convert(0, output). + template <class T> + bool convert(T *output) const { + union { + T t; + unsigned char ch[sizeof(T)]; + } buffer; + for (size_t i = 0; i < sizeof(T); i++) { + buffer.ch[i] = (*this)[i]; + } + if (!errorOccurred()) { + *output = buffer.t; + } + return !errorOccurred(); + } + + // Reinterprets this pointer as a pointer to an array of T, then returns the + // element at position 'index' in this array of T. (Note that this position + // corresponds to position index * sizeof(T) in the underlying byte array.) + // + // Returns true if successful; false if an out-of-range error occurred or if + // the error flag was already set on the pointer when calling convert(). + // + // The conversion from a sequence of sizeof(T) bytes to a T is performed in an + // implementation-defined fashion. This conversion is equivalent to the one + // obtained using the following union by filling the array 'ch' and then + // reading the member 't': + // + // union { + // T t; + // unsigned char ch[sizeof(T)]; + // }; + // + // Callers should note that, among other things, the conversion is not + // endian-agnostic with respect to the endianness of T. + template <class T> + bool convert(size_t index, T *output) const { + RangeCheckedBytePtr p = (*this) + index * sizeof(T); + bool valid = p.convert(output); + if (!valid) { + error_flag_ = p.error_flag_; + } + return valid; + } + + // Operators. Unless otherwise noted, these operators have the same semantics + // as the same operators on an unsigned char pointer. + + // If an out-of-range access is attempted, returns 0 (and sets the error + // flag). + inline unsigned char operator[](size_t i) const; + + inline unsigned char operator*() const; + + inline RangeCheckedBytePtr &operator++(); + + inline RangeCheckedBytePtr operator++(int); + + inline RangeCheckedBytePtr &operator--(); + + inline RangeCheckedBytePtr operator--(int); + + inline RangeCheckedBytePtr &operator+=(size_t x); + + inline RangeCheckedBytePtr &operator-=(size_t x); + + inline friend RangeCheckedBytePtr operator+(const RangeCheckedBytePtr &p, + size_t x); + + inline friend RangeCheckedBytePtr operator-(const RangeCheckedBytePtr &p, + size_t x); + + // Tests whether x and y point at the same position in the underlying array. + // Two pointers that point at the same position but have different + // sub-arrays still compare equal. It is not legal to compare two pointers + // that point into different paged byte arrays. + friend bool operator==(const RangeCheckedBytePtr &x, + const RangeCheckedBytePtr &y); + + // Returns !(x == y). + friend bool operator!=(const RangeCheckedBytePtr &x, + const RangeCheckedBytePtr &y); + + private: + void loadPageForOffset(size_t offset) const; + void restrictPageToSubArray() const; +}; + +// Returns the result of calling std::memcmp() on the sequences of 'num' bytes +// pointed to by 'x' and 'y'. The result is undefined if either +// x.remainingLength() or y.remainingLength() is less than 'num'. +int memcmp(const RangeCheckedBytePtr &x, const RangeCheckedBytePtr &y, + size_t num); + +// Returns the result of calling std::memcmp() (note: _not_ strcmp()) on the +// y.length() number of bytes pointed to by 'x' and the string 'y'. The result +// is undefined if x.remainingLength() is less than y.length(). +int strcmp(const RangeCheckedBytePtr &x, const std::string &y); + +// Returns the length of the zero-terminated string starting at 'src' (not +// including the '\0' terminator). If no '\0' occurs before the end of the +// array, the result is undefined. +size_t strlen(const RangeCheckedBytePtr &src); + +// Integer decoding functions. +// +// These functions read signed (Get16s, Get32s) or unsigned (Get16u, Get32u) +// integers from 'input'. The integer read from the input can be specified to be +// either big-endian (big_endian == true) or little-endian +// (little_endian == false). Signed integers are read in two's-complement +// representation. The integer read in the specified format is then converted to +// the implementation's native integer representation and returned. In other +// words, the semantics of these functions are independent of the +// implementation's endianness and signed integer representation. +// +// If an out-of-range error occurs, these functions do _not_ set the error flag +// on 'input'. Instead, they set 'status' to RANGE_CHECKED_BYTE_ERROR and return +// 0. +// +// Note: +// - If an error occurs and 'status' is already set to an error value (i.e. a +// value different from RANGE_CHECKED_BYTE_SUCCESS), the value of 'status' is +// left unchanged. +// - If the operation is successful, 'status' is left unchanged (i.e. it is not +// actively set to RANGE_CHECKED_BYTE_SUCCESS). +// +// Together, these two properties mean that these functions can be used to read +// a number of integers in succession with only a single error check, like this: +// +// MemoryStatus status = RANGE_CHECKED_BYTE_SUCCESS; +// int16 val1 = Get16s(input, false, &status); +// int32 val2 = Get32s(input + 2, false, &status); +// uint32 val3 = Get32u(input + 6, false, &status); +// if (status != RANGE_CHECKED_BYTE_SUCCESS) { +// // error handling +// } +int16 Get16s(const RangeCheckedBytePtr &input, const bool big_endian, + MemoryStatus *status); +uint16 Get16u(const RangeCheckedBytePtr &input, const bool big_endian, + MemoryStatus *status); +int32 Get32s(const RangeCheckedBytePtr &input, const bool big_endian, + MemoryStatus *status); +uint32 Get32u(const RangeCheckedBytePtr &input, const bool big_endian, + MemoryStatus *status); + +size_t RangeCheckedBytePtr::remainingLength() const { + if (!errorOccurred()) { + // current_pos_ <= sub_array_end_ is a class invariant, but protect + // against violations of this invariant. + if (current_pos_ <= sub_array_end_) { + return sub_array_end_ - current_pos_; + } else { + assert(false); + return 0; + } + } else { + return 0; + } +} + +bool RangeCheckedBytePtr::errorOccurred() const { + return error_flag_ != RANGE_CHECKED_BYTE_SUCCESS; +} + +unsigned char RangeCheckedBytePtr::operator[](size_t i) const { + // Check that pointer doesn't have an error flag set. + if (!errorOccurred()) { + // Offset in array to read from. + const size_t read_offset = current_pos_ + i; + + // Check for the common case first: The byte we want to read lies in the + // current page. For performance reasons, we don't check for the case + // "read_offset < page_begin_offset_" explicitly; if it occurs, it will + // lead to wraparound (which is well-defined for unsigned quantities), and + // this will cause the test "pos_in_page < current_page_len_" to fail. + size_t pos_in_page = read_offset - page_begin_offset_; + if (pos_in_page < current_page_len_) { + return page_data_[pos_in_page]; + } + + // Check that the offset we're trying to read lies within the sub-array + // we're allowed to access. + if (read_offset >= sub_array_begin_ && read_offset < sub_array_end_) { + // Read the page that contains the offset "read_offset". + loadPageForOffset(read_offset); + + // Compute the position within the new page from which we need to read. + pos_in_page = read_offset - page_begin_offset_; + + // After the call to loadPageForOffset(), read_offset must lie within + // the current page, and therefore pos_in_page must be less than the + // length of the page. We nevertheless check for this to protect against + // potential bugs in loadPageForOffset(). + assert(pos_in_page < current_page_len_); + if (pos_in_page < current_page_len_) { + return page_data_[pos_in_page]; + } + } + } + +// All error cases fall through to here. +#ifdef BREAK_IF_DEBUGGING_AND_OUT_OF_RANGE + assert(false); +#endif + error_flag_ = RANGE_CHECKED_BYTE_ERROR_OVERFLOW; + // return 0, which represents the invalid value + return static_cast<unsigned char>(0); +} + +unsigned char RangeCheckedBytePtr::operator*() const { return (*this)[0]; } + +RangeCheckedBytePtr &RangeCheckedBytePtr::operator++() { + if (current_pos_ < sub_array_end_) { + current_pos_++; + } else { +#ifdef BREAK_IF_DEBUGGING_AND_OUT_OF_RANGE + assert(false); +#endif + error_flag_ = RANGE_CHECKED_BYTE_ERROR_OVERFLOW; + } + return *this; +} + +RangeCheckedBytePtr RangeCheckedBytePtr::operator++(int) { + RangeCheckedBytePtr result(*this); + ++(*this); + return result; +} + +RangeCheckedBytePtr &RangeCheckedBytePtr::operator--() { + if (current_pos_ > sub_array_begin_) { + current_pos_--; + } else { +#ifdef BREAK_IF_DEBUGGING_AND_OUT_OF_RANGE + assert(false); +#endif + error_flag_ = RANGE_CHECKED_BYTE_ERROR_UNDERFLOW; + } + return *this; +} + +RangeCheckedBytePtr RangeCheckedBytePtr::operator--(int) { + RangeCheckedBytePtr result(*this); + --(*this); + return result; +} + +RangeCheckedBytePtr &RangeCheckedBytePtr::operator+=(size_t x) { + if (remainingLength() >= x) { + current_pos_ += x; + } else { +#ifdef BREAK_IF_DEBUGGING_AND_OUT_OF_RANGE + assert(false); +#endif + error_flag_ = RANGE_CHECKED_BYTE_ERROR_OVERFLOW; + } + return *this; +} + +RangeCheckedBytePtr &RangeCheckedBytePtr::operator-=(size_t x) { + if (x <= current_pos_ - sub_array_begin_) { + current_pos_ -= x; + } else { +#ifdef BREAK_IF_DEBUGGING_AND_OUT_OF_RANGE + assert(false); +#endif + error_flag_ = RANGE_CHECKED_BYTE_ERROR_UNDERFLOW; + } + return *this; +} + +RangeCheckedBytePtr operator+(const RangeCheckedBytePtr &p, size_t x) { + RangeCheckedBytePtr result(p); + result += x; + return result; +} + +RangeCheckedBytePtr operator-(const RangeCheckedBytePtr &p, size_t x) { + RangeCheckedBytePtr result(p); + result -= x; + return result; +} + +} // namespace binary_parse +} // namespace piex + +#endif // PIEX_BINARY_PARSE_RANGE_CHECKED_BYTE_PTR_H_ |